mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Merge branch 'dev' into claude/refactor-process-management-WcQyZ
Resolved conflicts by keeping Process model changes and accepting dev changes for unrelated files. Ensured pid_utils.py remains deleted as intended by this PR. Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
This commit is contained in:
@@ -27,36 +27,45 @@ class ArchiveBoxGroup(click.Group):
|
||||
'init': 'archivebox.cli.archivebox_init.main',
|
||||
'install': 'archivebox.cli.archivebox_install.main',
|
||||
}
|
||||
# Model commands (CRUD operations via subcommands)
|
||||
model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
|
||||
'tag': 'archivebox.cli.archivebox_tag.main',
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
'update': 'archivebox.cli.archivebox_update.main',
|
||||
'search': 'archivebox.cli.archivebox_search.main',
|
||||
'status': 'archivebox.cli.archivebox_status.main',
|
||||
'config': 'archivebox.cli.archivebox_config.main',
|
||||
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
# Worker/orchestrator commands
|
||||
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
# Worker command
|
||||
'worker': 'archivebox.cli.archivebox_worker.main',
|
||||
# Task commands (called by workers as subprocesses)
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
**setup_commands,
|
||||
**model_commands,
|
||||
**archive_commands,
|
||||
}
|
||||
renamed_commands = {
|
||||
'setup': 'install',
|
||||
'list': 'search',
|
||||
'import': 'add',
|
||||
'archive': 'add',
|
||||
'export': 'search',
|
||||
# Old commands replaced by new model commands
|
||||
'orchestrator': 'run',
|
||||
'extract': 'archiveresult',
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -110,9 +119,9 @@ def cli(ctx, help=False):
|
||||
if help or ctx.invoked_subcommand is None:
|
||||
ctx.invoke(ctx.command.get_command(ctx, 'help'))
|
||||
|
||||
# if the subcommand is in the archive_commands dict and is not 'manage',
|
||||
# if the subcommand is in archive_commands or model_commands,
|
||||
# then we need to set up the django environment and check that we're in a valid data folder
|
||||
if subcommand in ArchiveBoxGroup.archive_commands:
|
||||
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
|
||||
# print('SETUP DJANGO AND CHECK DATA FOLDER')
|
||||
try:
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
380
archivebox/cli/archivebox_archiveresult.py
Normal file
380
archivebox/cli/archivebox_archiveresult.py
Normal file
@@ -0,0 +1,380 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox archiveresult <action> [args...] [--filters]
|
||||
|
||||
Manage ArchiveResult records (plugin extraction results).
|
||||
|
||||
Actions:
|
||||
create - Create ArchiveResults for Snapshots (queue extractions)
|
||||
list - List ArchiveResults as JSONL (with optional filters)
|
||||
update - Update ArchiveResults from stdin JSONL
|
||||
delete - Delete ArchiveResults from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create ArchiveResults for snapshots (queue for extraction)
|
||||
archivebox snapshot list --status=queued | archivebox archiveresult create
|
||||
archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>
|
||||
|
||||
# List with filters
|
||||
archivebox archiveresult list --status=failed
|
||||
archivebox archiveresult list --plugin=screenshot --status=succeeded
|
||||
|
||||
# Update (reset failed extractions to queued)
|
||||
archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued
|
||||
|
||||
# Delete
|
||||
archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes
|
||||
|
||||
# Re-run failed extractions
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox archiveresult'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_archiveresults(
    snapshot_id: Optional[str] = None,
    plugin: Optional[str] = None,
    status: str = 'queued',
) -> int:
    """
    Create ArchiveResult rows for Snapshots (queue extractions).

    Snapshot records are read from stdin JSONL unless a snapshot_id is
    given directly. Records that are not Snapshots are forwarded to stdout
    unchanged so the command composes in shell pipelines.
    If ``plugin`` is given, only that plugin's result is created (or reset
    for retry if it previously failed/was skipped); otherwise every pending
    plugin for each snapshot is queued.

    Exit codes:
        0: Success
        1: Failure (unknown id, empty input, no valid snapshot ids)
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    from archivebox.core.models import Snapshot, ArchiveResult

    piped_output = not sys.stdout.isatty()

    if snapshot_id:
        # Direct lookup by id: nothing arrives on stdin to pass through.
        passthrough = []
        try:
            target_snapshots = [Snapshot.objects.get(id=snapshot_id)]
        except Snapshot.DoesNotExist:
            rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
            return 1
    else:
        incoming = list(read_stdin())
        if not incoming:
            rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
            return 1

        wanted_ids = []
        passthrough = []
        for rec in incoming:
            rec_type = rec.get('type', '')
            if rec_type == TYPE_SNAPSHOT:
                # Forward the Snapshot record itself, and remember its id.
                passthrough.append(rec)
                if rec.get('id'):
                    wanted_ids.append(rec['id'])
            elif rec_type == TYPE_ARCHIVERESULT:
                # ArchiveResult records pass through untouched (records without
                # an id could be created here, but for now just forward them).
                passthrough.append(rec)
            elif rec_type:
                # Any other typed record (Crawl, Tag, etc): pass through.
                passthrough.append(rec)
            elif rec.get('id'):
                # Untyped record with an id: assume it names a snapshot.
                wanted_ids.append(rec['id'])

        # Emit pass-through records first so downstream commands see them.
        if piped_output:
            for rec in passthrough:
                write_record(rec)

        if not wanted_ids:
            if passthrough:
                rprint(f'[dim]Passed through {len(passthrough)} records, no new snapshots to process[/dim]', file=sys.stderr)
                return 0
            rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
            return 1

        target_snapshots = list(Snapshot.objects.filter(id__in=wanted_ids))
        if not target_snapshots:
            rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
            return 0 if passthrough else 1

    created_count = 0
    for snapshot in target_snapshots:
        if plugin:
            # Create (or reuse) the result for one specific plugin only.
            result, was_created = ArchiveResult.objects.get_or_create(
                snapshot=snapshot,
                plugin=plugin,
                defaults={
                    'status': status,
                    'retry_at': timezone.now(),
                },
            )
            if not was_created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
                # Existing failed/skipped result: reset it for retry.
                result.status = status
                result.retry_at = timezone.now()
                result.save()

            if piped_output:
                write_record(result.to_json())
            created_count += 1
        else:
            # Queue every pending plugin for this snapshot.
            snapshot.create_pending_archiveresults()
            for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
                if piped_output:
                    write_record(result.to_json())
                created_count += 1

    rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_archiveresults(
    status: Optional[str] = None,
    plugin: Optional[str] = None,
    snapshot_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List ArchiveResults with optional filters, newest first.

    Prints colorized human-readable rows when stdout is a TTY,
    JSONL records otherwise.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import ArchiveResult

    human_readable = sys.stdout.isatty()

    # Color used per status for the TTY display.
    status_colors = {
        'queued': 'yellow',
        'started': 'blue',
        'succeeded': 'green',
        'failed': 'red',
        'skipped': 'dim',
        'backoff': 'magenta',
    }

    results = apply_filters(
        ArchiveResult.objects.all().order_by('-start_ts'),
        {
            'status': status,
            'plugin': plugin,
            'snapshot_id': snapshot_id,
        },
        limit=limit,
    )

    shown = 0
    for result in results:
        if human_readable:
            color = status_colors.get(result.status, 'dim')
            rprint(f'[{color}]{result.status:10}[/{color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
        else:
            write_record(result.to_json())
        shown += 1

    rprint(f'[dim]Listed {shown} archive results[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_archiveresults(
    status: Optional[str] = None,
) -> int:
    """
    Apply PATCH-style updates to ArchiveResults read from stdin JSONL.

    Only the fields given via CLI flags are changed; setting a new status
    also bumps retry_at so workers pick the result up again. Each updated
    record is re-emitted as JSONL when stdout is piped.

    Exit codes:
        0: Success
        1: No input
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import ArchiveResult

    piped_output = not sys.stdout.isatty()

    incoming = list(read_stdin())
    if not incoming:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated = 0
    for rec in incoming:
        result_id = rec.get('id')
        if not result_id:
            continue

        try:
            result = ArchiveResult.objects.get(id=result_id)
        except ArchiveResult.DoesNotExist:
            rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
            continue

        # Apply updates from CLI flags (PATCH semantics).
        if status:
            result.status = status
            result.retry_at = timezone.now()

        result.save()
        updated += 1

        if piped_output:
            write_record(result.to_json())

    rprint(f'[green]Updated {updated} archive results[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete the ArchiveResults named by id in stdin JSONL records.

    ``dry_run`` previews up to 10 matches without deleting anything;
    otherwise the ``yes`` flag is required before anything is removed.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import ArchiveResult

    incoming = list(read_stdin())
    if not incoming:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    target_ids = [rec.get('id') for rec in incoming if rec.get('id')]
    if not target_ids:
        rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
        return 1

    matches = ArchiveResult.objects.filter(id__in=target_ids)
    total = matches.count()
    if total == 0:
        rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {total} archive results (dry run)[/yellow]', file=sys.stderr)
        # Preview only the first 10 matches to keep output bounded.
        for result in matches[:10]:
            rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
        if total > 10:
            rprint(f' ... and {total - 10} more', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform the actual deletion.
    deleted, _ = matches.delete()
    rprint(f'[green]Deleted {deleted} archive results[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage ArchiveResult records (plugin extraction results)."""


@main.command('create')
@click.option('--snapshot-id', help='Snapshot ID to create results for')
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
    """Create ArchiveResults for Snapshots from stdin JSONL."""
    exit_code = create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status)
    sys.exit(exit_code)


@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
@click.option('--plugin', '-p', help='Filter by plugin name')
@click.option('--snapshot-id', help='Filter by snapshot ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], plugin: Optional[str],
             snapshot_id: Optional[str], limit: Optional[int]):
    """List ArchiveResults as JSONL."""
    exit_code = list_archiveresults(
        status=status,
        plugin=plugin,
        snapshot_id=snapshot_id,
        limit=limit,
    )
    sys.exit(exit_code)


@main.command('update')
@click.option('--status', '-s', help='Set status')
def update_cmd(status: Optional[str]):
    """Update ArchiveResults from stdin JSONL."""
    sys.exit(update_archiveresults(status=status))


@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete ArchiveResults from stdin JSONL."""
    sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))


if __name__ == '__main__':
    main()
|
||||
290
archivebox/cli/archivebox_binary.py
Normal file
290
archivebox/cli/archivebox_binary.py
Normal file
@@ -0,0 +1,290 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox binary <action> [args...] [--filters]
|
||||
|
||||
Manage Binary records (detected executables like chrome, wget, etc.).
|
||||
|
||||
Actions:
|
||||
create - Create/register a Binary
|
||||
list - List Binaries as JSONL (with optional filters)
|
||||
update - Update Binaries from stdin JSONL
|
||||
delete - Delete Binaries from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# List all binaries
|
||||
archivebox binary list
|
||||
|
||||
# List specific binary
|
||||
archivebox binary list --name=chrome
|
||||
|
||||
# List binaries with specific version
|
||||
archivebox binary list --version__icontains=120
|
||||
|
||||
# Delete old binary entries
|
||||
archivebox binary list --name=chrome | archivebox binary delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox binary'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_binary(
    name: str,
    abspath: str,
    version: str = '',
) -> int:
    """
    Register a Binary record, idempotent on (name, abspath).

    Emits the resulting record as JSONL when stdout is piped.

    Exit codes:
        0: Success (created or already existed)
        1: Missing arguments or database error
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    piped_output = not sys.stdout.isatty()

    if not (name and abspath):
        rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
        return 1

    try:
        binary, was_created = Binary.objects.get_or_create(
            name=name,
            abspath=abspath,
            defaults={'version': version},
        )

        if piped_output:
            write_record(binary.to_json())

        if was_created:
            rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
        else:
            rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)

        return 0

    except Exception as e:
        # Best-effort CLI command: surface any DB/serialization error as exit 1.
        rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_binaries(
    name: Optional[str] = None,
    abspath__icontains: Optional[str] = None,
    version__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Binaries with optional filters, ordered by name then newest load.

    Prints a colorized table when stdout is a TTY, JSONL otherwise.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    human_readable = sys.stdout.isatty()

    binaries = apply_filters(
        Binary.objects.all().order_by('name', '-loaded_at'),
        {
            'name': name,
            'abspath__icontains': abspath__icontains,
            'version__icontains': version__icontains,
        },
        limit=limit,
    )

    shown = 0
    for binary in binaries:
        if human_readable:
            rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
        else:
            write_record(binary.to_json())
        shown += 1

    rprint(f'[dim]Listed {shown} binaries[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_binaries(
    version: Optional[str] = None,
    abspath: Optional[str] = None,
) -> int:
    """
    Apply PATCH-style updates to Binaries read from stdin JSONL.

    Only the fields given via CLI flags are changed. Each updated record
    is re-emitted as JSONL when stdout is piped.

    Exit codes:
        0: Success
        1: No input
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.machine.models import Binary

    piped_output = not sys.stdout.isatty()

    incoming = list(read_stdin())
    if not incoming:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated = 0
    for rec in incoming:
        binary_id = rec.get('id')
        if not binary_id:
            continue

        try:
            binary = Binary.objects.get(id=binary_id)
        except Binary.DoesNotExist:
            rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
            continue

        # Apply updates from CLI flags (PATCH semantics).
        if version:
            binary.version = version
        if abspath:
            binary.abspath = abspath

        binary.save()
        updated += 1

        if piped_output:
            write_record(binary.to_json())

    rprint(f'[green]Updated {updated} binaries[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete the Binaries named by id in stdin JSONL records.

    ``dry_run`` previews matches without deleting anything; otherwise
    the ``yes`` flag is required before anything is removed.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.machine.models import Binary

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    binary_ids = [r.get('id') for r in records if r.get('id')]

    if not binary_ids:
        rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
        return 1

    binaries = Binary.objects.filter(id__in=binary_ids)
    count = binaries.count()

    if count == 0:
        rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
        # Cap the preview at 10 entries so large inputs don't flood stderr
        # (consistent with `archivebox archiveresult delete --dry-run`).
        for binary in binaries[:10]:
            rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
        if count > 10:
            rprint(f' ... and {count - 10} more', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion
    deleted_count, _ = binaries.delete()
    rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Binary records (detected executables)."""


@main.command('create')
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
@click.option('--version', '-v', default='', help='Binary version')
def create_cmd(name: str, abspath: str, version: str):
    """Create/register a Binary."""
    exit_code = create_binary(name=name, abspath=abspath, version=version)
    sys.exit(exit_code)


@main.command('list')
@click.option('--name', '-n', help='Filter by name')
@click.option('--abspath__icontains', help='Filter by path contains')
@click.option('--version__icontains', help='Filter by version contains')
@click.option('--limit', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
             version__icontains: Optional[str], limit: Optional[int]):
    """List Binaries as JSONL."""
    exit_code = list_binaries(
        name=name,
        abspath__icontains=abspath__icontains,
        version__icontains=version__icontains,
        limit=limit,
    )
    sys.exit(exit_code)


@main.command('update')
@click.option('--version', '-v', help='Set version')
@click.option('--abspath', '-p', help='Set path')
def update_cmd(version: Optional[str], abspath: Optional[str]):
    """Update Binaries from stdin JSONL."""
    sys.exit(update_binaries(version=version, abspath=abspath))


@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Binaries from stdin JSONL."""
    sys.exit(delete_binaries(yes=yes, dry_run=dry_run))


if __name__ == '__main__':
    main()
|
||||
@@ -1,108 +1,153 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox crawl [urls...] [--depth=N] [--tag=TAG]
|
||||
archivebox crawl <action> [args...] [--filters]
|
||||
|
||||
Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
|
||||
Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process.
|
||||
Manage Crawl records.
|
||||
|
||||
Input formats:
|
||||
- Plain URLs (one per line)
|
||||
- JSONL: {"url": "...", "depth": 1, "tags": "..."}
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...}
|
||||
Actions:
|
||||
create - Create Crawl jobs from URLs
|
||||
list - List Crawls as JSONL (with optional filters)
|
||||
update - Update Crawls from stdin JSONL
|
||||
delete - Delete Crawls from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create a crawl job
|
||||
archivebox crawl https://example.com
|
||||
# Create
|
||||
archivebox crawl create https://example.com https://foo.com --depth=1
|
||||
archivebox crawl create --tag=news https://example.com
|
||||
|
||||
# Create crawl with depth
|
||||
archivebox crawl --depth=1 https://example.com
|
||||
# List with filters
|
||||
archivebox crawl list --status=queued
|
||||
archivebox crawl list --urls__icontains=example.com
|
||||
|
||||
# Full pipeline: create crawl, create snapshots, run extractors
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
# Update
|
||||
archivebox crawl list --status=started | archivebox crawl update --status=queued
|
||||
|
||||
# Process existing Crawl by ID (runs the crawl state machine)
|
||||
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
|
||||
# Delete
|
||||
archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes
|
||||
|
||||
# Full pipeline
|
||||
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def create_crawls(
|
||||
records: list,
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_crawl(
|
||||
urls: Iterable[str],
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
created_by_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create a single Crawl job from all input URLs.
|
||||
Create a Crawl job from URLs.
|
||||
|
||||
Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
|
||||
Does NOT start the crawl - just creates the job in QUEUED state.
|
||||
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
|
||||
Pass-through: Records that are not URLs are output unchanged (for piping).
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect all URLs into a single newline-separated string
|
||||
urls = []
|
||||
# Separate pass-through records from URL records
|
||||
url_list = []
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
|
||||
# Pass-through: output records that aren't URL/Crawl types
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Handle existing Crawl records (just pass through with id)
|
||||
if record_type == TYPE_CRAWL and record.get('id'):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Collect URLs
|
||||
url = record.get('url')
|
||||
if url:
|
||||
urls.append(url)
|
||||
url_list.append(url)
|
||||
|
||||
if not urls:
|
||||
# Handle 'urls' field (newline-separated)
|
||||
urls_field = record.get('urls')
|
||||
if urls_field:
|
||||
for line in urls_field.split('\n'):
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
url_list.append(line)
|
||||
|
||||
# Output pass-through records first
|
||||
if not is_tty:
|
||||
for record in pass_through_records:
|
||||
write_record(record)
|
||||
|
||||
if not url_list:
|
||||
if pass_through_records:
|
||||
# If we had pass-through records but no URLs, that's OK
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Build crawl record with all URLs as newline-separated string
|
||||
crawl_record = {
|
||||
'urls': '\n'.join(urls),
|
||||
'urls': '\n'.join(url_list),
|
||||
'max_depth': depth,
|
||||
'tags_str': tag,
|
||||
'status': status,
|
||||
'label': '',
|
||||
}
|
||||
|
||||
crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
if not crawl:
|
||||
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(crawl.to_jsonl())
|
||||
write_record(crawl.to_json())
|
||||
|
||||
rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr)
|
||||
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
|
||||
for url in urls[:5]: # Show first 5 URLs
|
||||
for url in url_list[:5]: # Show first 5 URLs
|
||||
rprint(f' {url[:70]}', file=sys.stderr)
|
||||
if len(urls) > 5:
|
||||
rprint(f' ... and {len(urls) - 5} more', file=sys.stderr)
|
||||
if len(url_list) > 5:
|
||||
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
@@ -111,81 +156,217 @@ def create_crawls(
|
||||
return 1
|
||||
|
||||
|
||||
def process_crawl_by_id(crawl_id: str) -> int:
|
||||
"""
|
||||
Process a single Crawl by ID (used by workers).
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
Triggers the Crawl's state machine tick() which will:
|
||||
- Transition from queued -> started (creates root snapshot)
|
||||
- Transition from started -> sealed (when all snapshots done)
|
||||
def list_crawls(
    status: Optional[str] = None,
    urls__icontains: Optional[str] = None,
    max_depth: Optional[int] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Crawls as JSONL with optional filters.

    Prints a colorized human-readable line per Crawl when stdout is a TTY,
    otherwise emits one JSONL record per Crawl for piping.

    Exit codes:
    0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.crawls.models import Crawl

    human_output = sys.stdout.isatty()

    # Newest first; filters are applied lazily by the ORM.
    crawls = apply_filters(
        Crawl.objects.all().order_by('-created_at'),
        {
            'status': status,
            'urls__icontains': urls__icontains,
            'max_depth': max_depth,
        },
        limit=limit,
    )

    # One color per lifecycle state for the TTY view.
    colors = {
        'queued': 'yellow',
        'started': 'blue',
        'sealed': 'green',
    }

    total = 0
    for crawl in crawls:
        if human_output:
            color = colors.get(crawl.status, 'dim')
            preview = crawl.urls[:50].replace('\n', ' ')
            rprint(f'[{color}]{crawl.status:8}[/{color}] [dim]{crawl.id}[/dim] {preview}...')
        else:
            write_record(crawl.to_json())
        total += 1

    rprint(f'[dim]Listed {total} crawls[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
def is_crawl_id(value: str) -> bool:
|
||||
"""Check if value looks like a Crawl UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Crawl (not a Snapshot or other object)
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_crawls(
    status: Optional[str] = None,
    max_depth: Optional[int] = None,
) -> int:
    """
    Update Crawls from stdin JSONL.

    Reads Crawl records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Exit codes:
    0: Success
    1: No input or error
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.crawls.models import Crawl

    emit_jsonl = not sys.stdout.isatty()

    incoming = list(read_stdin())
    if not incoming:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    changed = 0
    for rec in incoming:
        pk = rec.get('id')
        if not pk:
            # Records without an id cannot be matched to a Crawl; skip silently.
            continue

        # Look up the target; warn and move on when it no longer exists.
        try:
            crawl = Crawl.objects.get(id=pk)
        except Crawl.DoesNotExist:
            rprint(f'[yellow]Crawl not found: {pk}[/yellow]', file=sys.stderr)
            continue

        # PATCH semantics: only touch the fields the caller supplied.
        if status:
            crawl.status = status
            # NOTE(review): resetting retry_at presumably re-queues the crawl
            # for processing, mirroring the run/create commands — confirm.
            crawl.retry_at = timezone.now()
        if max_depth is not None:
            crawl.max_depth = max_depth

        crawl.save()
        changed += 1

        if emit_jsonl:
            write_record(crawl.to_json())

    rprint(f'[green]Updated {changed} crawls[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls from stdin JSONL.

    Requires --yes flag to confirm deletion; --dry-run previews without touching anything.

    Exit codes:
    0: Success
    1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    incoming = list(read_stdin())
    if not incoming:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Only records that actually carry an id can be deleted.
    ids = [rec.get('id') for rec in incoming if rec.get('id')]
    if not ids:
        rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
        return 1

    matched = Crawl.objects.filter(id__in=ids)
    n_matched = matched.count()
    if n_matched == 0:
        rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        # Preview only: report what would be removed, delete nothing.
        rprint(f'[yellow]Would delete {n_matched} crawls (dry run)[/yellow]', file=sys.stderr)
        for crawl in matched:
            preview = crawl.urls[:50].replace('\n', ' ')
            rprint(f' [dim]{crawl.id}[/dim] {preview}...', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion; delete() returns (total_rows_deleted, per_model_counts).
    n_deleted, _ = matched.delete()
    rprint(f'[green]Deleted {n_deleted} crawls[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Crawl records."""
|
||||
|
||||
|
||||
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
    """Create a Crawl job from URLs or stdin."""
    # Delegate to the implementation and surface its exit code to the shell.
    exit_code = create_crawl(urls, depth=depth, tag=tag, status=status)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--urls__icontains', help='Filter by URLs contains')
@click.option('--max-depth', type=int, help='Filter by max depth')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
             max_depth: Optional[int], limit: Optional[int]):
    """List Crawls as JSONL."""
    # Forward all filters unchanged; list_crawls returns the process exit code.
    exit_code = list_crawls(
        status=status,
        urls__icontains=urls__icontains,
        max_depth=max_depth,
        limit=limit,
    )
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--max-depth', type=int, help='Set max depth')
def update_cmd(status: Optional[str], max_depth: Optional[int]):
    """Update Crawls from stdin JSONL."""
    # update_crawls reads the target records from stdin itself.
    exit_code = update_crawls(status=status, max_depth=max_depth)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Crawls from stdin JSONL."""
    # delete_crawls reads the target records from stdin itself.
    exit_code = delete_crawls(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
if pending_links:
|
||||
for link_dict in pending_links.values():
|
||||
Snapshot.from_jsonl(link_dict)
|
||||
Snapshot.from_json(link_dict)
|
||||
|
||||
# Hint for orphaned snapshot directories
|
||||
print()
|
||||
|
||||
99
archivebox/cli/archivebox_machine.py
Normal file
99
archivebox/cli/archivebox_machine.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox machine <action> [--filters]
|
||||
|
||||
Manage Machine records (system-managed, mostly read-only).
|
||||
|
||||
Machine records track the host machines where ArchiveBox runs.
|
||||
They are created automatically by the system and are primarily for debugging.
|
||||
|
||||
Actions:
|
||||
list - List Machines as JSONL (with optional filters)
|
||||
|
||||
Examples:
|
||||
# List all machines
|
||||
archivebox machine list
|
||||
|
||||
# List machines by hostname
|
||||
archivebox machine list --hostname__icontains=myserver
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox machine'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_machines(
    hostname__icontains: Optional[str] = None,
    os_platform: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Machines as JSONL with optional filters.

    Prints a human-readable line per Machine when stdout is a TTY,
    otherwise emits one JSONL record per Machine for piping.

    Exit codes:
    0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Machine

    human_output = sys.stdout.isatty()

    # Newest first; only the filters the caller supplied are applied.
    machines = apply_filters(
        Machine.objects.all().order_by('-created_at'),
        {
            'hostname__icontains': hostname__icontains,
            'os_platform': os_platform,
        },
        limit=limit,
    )

    total = 0
    for machine in machines:
        if human_output:
            rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
        else:
            write_record(machine.to_json())
        total += 1

    rprint(f'[dim]Listed {total} machines[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Machine records (read-only, system-managed)."""
|
||||
|
||||
|
||||
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
    """List Machines as JSONL."""
    # Forward all filters unchanged; list_machines returns the exit code.
    exit_code = list_machines(
        hostname__icontains=hostname__icontains,
        os_platform=os_platform,
        limit=limit,
    )
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
356
archivebox/cli/archivebox_pluginmap.py
Normal file
356
archivebox/cli/archivebox_pluginmap.py
Normal file
@@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
# State Machine ASCII Art Diagrams
|
||||
CRAWL_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ CrawlMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ │
|
||||
│ │ crawl.run()│ │ tick() unless is_finished() │
|
||||
│ │ (discover │ │ │
|
||||
│ │ Crawl │─────────────────┘ │
|
||||
│ │ hooks) │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ │ tick() when is_finished() │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ SEALED │ │
|
||||
│ │ (final) │ │
|
||||
│ │ │ │
|
||||
│ │ enter: │ │
|
||||
│ │ cleanup() │ │
|
||||
│ └─────────────┘ │
|
||||
│ │
|
||||
│ Hooks triggered: on_Crawl__* (during STARTED.enter via crawl.run()) │
|
||||
│ on_CrawlEnd__* (during SEALED.enter via cleanup()) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
SNAPSHOT_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ SnapshotMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ │
|
||||
│ │ snapshot │ │ tick() unless is_finished() │
|
||||
│ │ .run() │ │ │
|
||||
│ │ (discover │─────────────────┘ │
|
||||
│ │ Snapshot │ │
|
||||
│ │ hooks, │ │
|
||||
│ │ create │ │
|
||||
│ │ pending │ │
|
||||
│ │ results) │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ │ tick() when is_finished() │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ SEALED │ │
|
||||
│ │ (final) │ │
|
||||
│ │ │ │
|
||||
│ │ enter: │ │
|
||||
│ │ cleanup() │ │
|
||||
│ └─────────────┘ │
|
||||
│ │
|
||||
│ Hooks triggered: on_Snapshot__* (creates ArchiveResults in STARTED.enter) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
ARCHIVERESULT_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ ArchiveResultMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ tick() unless is_finished() │
|
||||
│ │ result.run()│─────────────────┘ │
|
||||
│ │ (execute │ │
|
||||
│ │ hook via │ │
|
||||
│ │ run_hook())│ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ │ tick() checks status set by hook output │
|
||||
│ ├────────────────┬────────────────┬────────────────┐ │
|
||||
│ ▼ ▼ ▼ ▼ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │
|
||||
│ │ (final) │ │ (final) │ │ (final) │ │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ │
|
||||
│ │ │
|
||||
│ can_start()───┘ │
|
||||
│ loops back to STARTED │
|
||||
│ │
|
||||
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
BINARY_MACHINE_DIAGRAM = """
|
||||
┌─────────────────────────────────────────────────────────────────────────────┐
|
||||
│ BinaryMachine │
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ │
|
||||
│ │ binary.run()│ │ tick() unless is_finished() │
|
||||
│ │ (discover │─────────────────┘ │
|
||||
│ │ Binary │ │
|
||||
│ │ hooks, │ │
|
||||
│ │ try each │ │
|
||||
│ │ provider) │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ │ tick() checks status set by hook output │
|
||||
│ ├────────────────────────────────┐ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │
|
||||
│ │ (final) │ │ (final) │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ abspath, │ │ no provider │ │
|
||||
│ │ version set │ │ succeeded │ │
|
||||
│ └─────────────┘ └─────────────┘ │
|
||||
│ │
|
||||
│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter) │
|
||||
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
|
||||
@enforce_types
def pluginmap(
    show_disabled: bool = False,
    model: Optional[str] = None,
    quiet: bool = False,
) -> dict:
    """
    Show a map of all state machines and their associated plugin hooks.

    Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
    ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
    that will run for each model's transitions.

    Args:
        show_disabled: also include hooks belonging to disabled plugins.
        model: restrict output to a single event name (Crawl, CrawlEnd,
            Snapshot, Binary); matched case-insensitively.
        quiet: build and return the mapping without printing anything.

    Returns:
        dict with per-event hook metadata under 'models' plus the builtin and
        user plugin directory paths; {} when `model` names an unknown event.
    """
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich import box

    from archivebox.hooks import (
        discover_hooks,
        extract_step,
        is_background_hook,
        BUILTIN_PLUGINS_DIR,
        USER_PLUGINS_DIR,
    )

    console = Console()
    prnt = console.print

    # Model event types that can have hooks
    model_events = {
        'Crawl': {
            'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
            'machine': 'CrawlMachine',
            'diagram': CRAWL_MACHINE_DIAGRAM,
        },
        'CrawlEnd': {
            'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
            'machine': 'CrawlMachine',
            'diagram': None,  # Part of CrawlMachine
        },
        'Snapshot': {
            'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
            'machine': 'SnapshotMachine',
            'diagram': SNAPSHOT_MACHINE_DIAGRAM,
        },
        'Binary': {
            'description': 'Hooks for installing binary dependencies (providers)',
            'machine': 'BinaryMachine',
            'diagram': BINARY_MACHINE_DIAGRAM,
        },
    }

    # Filter to specific model if requested.
    # BUGFIX: the previous `model.title()` normalization mangled camel-case
    # keys ('CrawlEnd'.lower().title() -> 'Crawlend'), so `--model CrawlEnd`
    # could never match. Match case-insensitively against the real keys.
    if model:
        canonical = {name.lower(): name for name in model_events}
        matched_name = canonical.get(model.lower())
        if matched_name is None:
            prnt(f'[red]Error: Unknown model "{model}". Available: {", ".join(model_events.keys())}[/red]')
            return {}
        model_events = {matched_name: model_events[matched_name]}

    result = {
        'models': {},
        'plugins_dir': str(BUILTIN_PLUGINS_DIR),
        'user_plugins_dir': str(USER_PLUGINS_DIR),
    }

    if not quiet:
        prnt()
        prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
        prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
        prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
        prnt()

        # Show ArchiveResult diagram separately since it's not an event key in
        # model_events (each ArchiveResult runs a single specific hook).
        prnt(Panel(
            ARCHIVERESULT_MACHINE_DIAGRAM,
            title='[bold green]ArchiveResultMachine[/bold green]',
            border_style='green',
            expand=False,
        ))
        prnt()

    for event_name, info in model_events.items():
        # Discover hooks for this event
        hooks = discover_hooks(event_name, filter_disabled=not show_disabled)

        # Build hook info list
        hook_infos = []
        for hook_path in hooks:
            # Plugin name comes from the parent directory
            # (e.g. 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
            plugin_name = hook_path.parent.name
            step = extract_step(hook_path.name)
            is_bg = is_background_hook(hook_path.name)

            hook_infos.append({
                'path': str(hook_path),
                'name': hook_path.name,
                'plugin': plugin_name,
                'step': step,
                'is_background': is_bg,
                'extension': hook_path.suffix,
            })

        result['models'][event_name] = {
            'description': info['description'],
            'machine': info['machine'],
            'hooks': hook_infos,
            'hook_count': len(hook_infos),
        }

        if not quiet:
            # Show diagram if this model has one
            if info.get('diagram'):
                prnt(Panel(
                    info['diagram'],
                    title=f'[bold green]{info["machine"]}[/bold green]',
                    border_style='green',
                    expand=False,
                ))
                prnt()

            # Create hooks table
            table = Table(
                title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
                box=box.ROUNDED,
                show_header=True,
                header_style='bold magenta',
            )
            table.add_column('Step', justify='center', width=6)
            table.add_column('Plugin', style='cyan', width=20)
            table.add_column('Hook Name', style='green')
            table.add_column('BG', justify='center', width=4)
            table.add_column('Type', justify='center', width=5)

            # Sort by step then by name
            sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))

            for hook in sorted_hooks:
                bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
                ext = hook['extension'].lstrip('.')
                table.add_row(
                    str(hook['step']),
                    hook['plugin'],
                    hook['name'],
                    bg_marker,
                    ext,
                )

            prnt(table)
            prnt()
            prnt(f'[dim]{info["description"]}[/dim]')
            prnt()

    # Summary
    if not quiet:
        total_hooks = sum(m['hook_count'] for m in result['models'].values())
        prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
        prnt()
        prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
        prnt('[dim] - XX: Two-digit order (first digit = step 0-9)[/dim]')
        prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
        prnt('[dim] - ext: py, sh, or js[/dim]')
        prnt()

    return result
|
||||
|
||||
|
||||
@click.command()
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
@docstring(pluginmap.__doc__)
def main(**kwargs):
    import json

    # pluginmap() prints nothing in quiet mode, so emit the data as JSON here.
    mapping = pluginmap(**kwargs)
    if kwargs.get('quiet'):
        print(json.dumps(mapping, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
107
archivebox/cli/archivebox_process.py
Normal file
107
archivebox/cli/archivebox_process.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox process <action> [--filters]
|
||||
|
||||
Manage Process records (system-managed, mostly read-only).
|
||||
|
||||
Process records track executions of binaries during extraction.
|
||||
They are created automatically by the system and are primarily for debugging.
|
||||
|
||||
Actions:
|
||||
list - List Processes as JSONL (with optional filters)
|
||||
|
||||
Examples:
|
||||
# List all processes
|
||||
archivebox process list
|
||||
|
||||
# List processes by binary
|
||||
archivebox process list --binary-name=chrome
|
||||
|
||||
# List recent processes
|
||||
archivebox process list --limit=10
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox process'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_processes(
    binary_name: Optional[str] = None,
    machine_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Processes as JSONL with optional filters.

    Prints a colorized human-readable line per Process when stdout is a TTY,
    otherwise emits one JSONL record per Process for piping.

    Exit codes:
    0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Process

    human_output = sys.stdout.isatty()

    # Most recent first; prefetch related rows for the TTY display.
    processes = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')

    # Only include filters the caller actually supplied.
    criteria = {}
    if binary_name:
        criteria['binary__name'] = binary_name
    if machine_id:
        criteria['machine_id'] = machine_id

    processes = apply_filters(processes, criteria, limit=limit)

    total = 0
    for proc in processes:
        if human_output:
            bin_label = proc.binary.name if proc.binary else 'unknown'
            code = proc.returncode if proc.returncode is not None else '?'
            # green = exited 0, red = nonzero exit, yellow = returncode unknown
            color = 'green' if proc.returncode == 0 else 'red' if proc.returncode else 'yellow'
            rprint(f'[{color}]exit={code:3}[/{color}] [cyan]{bin_label:15}[/cyan] [dim]{proc.id}[/dim]')
        else:
            write_record(proc.to_json())
        total += 1

    rprint(f'[dim]Listed {total} processes[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Process records (read-only, system-managed)."""
|
||||
|
||||
|
||||
@main.command('list')
@click.option('--binary-name', '-b', help='Filter by binary name')
@click.option('--machine-id', '-m', help='Filter by machine ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
    """List Processes as JSONL."""
    # Forward all filters unchanged; list_processes returns the exit code.
    exit_code = list_processes(
        binary_name=binary_name,
        machine_id=machine_id,
        limit=limit,
    )
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
207
archivebox/cli/archivebox_run.py
Normal file
207
archivebox/cli/archivebox_run.py
Normal file
@@ -0,0 +1,207 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox run [--daemon]
|
||||
|
||||
Unified command for processing queued work.
|
||||
|
||||
Modes:
|
||||
- With stdin JSONL: Process piped records, exit when complete
|
||||
- Without stdin (TTY): Run orchestrator in foreground until killed
|
||||
|
||||
Examples:
|
||||
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
|
||||
archivebox run
|
||||
|
||||
# Run as daemon (don't exit on idle)
|
||||
archivebox run --daemon
|
||||
|
||||
# Process specific records (pipe any JSONL type, exits when done)
|
||||
archivebox snapshot list --status=queued | archivebox run
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
archivebox crawl list --status=queued | archivebox run
|
||||
|
||||
# Mixed types work too
|
||||
cat mixed_records.jsonl | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox run'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
|
||||
def process_stdin_records() -> int:
    """
    Process JSONL records piped in on stdin.

    Create-or-update behavior:
    - Records WITHOUT an id: create via Model.from_json(), then queue
    - Records WITH an id: look up the existing row (falling back to
      re-creation if it no longer exists), then re-queue it

    Handles Crawl, Snapshot, and ArchiveResult records; anything else is
    passed through unchanged. After queueing, runs the orchestrator until
    all queued work is drained, and echoes every processed record as JSONL
    on stdout (for chaining) when stdout is not a TTY.

    Returns an exit code (0 = success).
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.workers.orchestrator import Orchestrator

    records = list(read_stdin())
    is_tty = sys.stdout.isatty()

    if not records:
        return 0  # Nothing to process

    created_by_id = get_or_create_system_user_pk()

    def _load_or_create(model, record, overrides=None):
        """Fetch an existing row by id, or (re)create one from the record."""
        record_id = record.get('id')
        if record_id:
            try:
                return model.objects.get(id=record_id)
            except model.DoesNotExist:
                pass  # stale id in the piped record - fall through to create
        if overrides is not None:
            return model.from_json(record, overrides=overrides)
        return model.from_json(record)

    queued_count = 0
    output_records = []

    for record in records:
        record_type = record.get('type', '')

        try:
            if record_type == TYPE_CRAWL:
                crawl = _load_or_create(Crawl, record, {'created_by_id': created_by_id})
                if crawl:
                    crawl.retry_at = timezone.now()
                    # never re-open a sealed (finished) crawl
                    if crawl.status not in [Crawl.StatusChoices.SEALED]:
                        crawl.status = Crawl.StatusChoices.QUEUED
                    crawl.save()
                    output_records.append(crawl.to_json())
                    queued_count += 1

            elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
                # untyped records carrying a url are treated as Snapshots
                snapshot = _load_or_create(Snapshot, record, {'created_by_id': created_by_id})
                if snapshot:
                    snapshot.retry_at = timezone.now()
                    if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                    snapshot.save()
                    output_records.append(snapshot.to_json())
                    queued_count += 1

            elif record_type == TYPE_ARCHIVERESULT:
                archiveresult = _load_or_create(ArchiveResult, record)
                if archiveresult:
                    archiveresult.retry_at = timezone.now()
                    # only reset retryable terminal states back to queued
                    if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
                        archiveresult.status = ArchiveResult.StatusChoices.QUEUED
                    archiveresult.save()
                    output_records.append(archiveresult.to_json())
                    queued_count += 1

            else:
                # Unknown type - pass through untouched for downstream consumers
                output_records.append(record)

        except Exception as e:
            rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
            continue

    # Output all processed records (for chaining)
    if not is_tty:
        for rec in output_records:
            write_record(rec)

    if queued_count == 0:
        rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
        return 0

    rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)

    # Run orchestrator until all queued work is done
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.runloop()

    return 0
|
||||
|
||||
|
||||
def run_orchestrator(daemon: bool = False) -> int:
    """
    Run the orchestrator process in the foreground.

    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)

    Args:
        daemon: Run forever (don't exit when idle)

    Returns exit code (0 = success, 1 = error).
    """
    from archivebox.workers.orchestrator import Orchestrator

    if Orchestrator.is_running():
        rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr)
        return 0

    try:
        Orchestrator(exit_on_idle=not daemon).runloop()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the foreground loop
        pass
    except Exception as e:
        rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
    return 0
|
||||
|
||||
|
||||
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
def main(daemon: bool):
    """
    Process queued work.

    When stdin is piped: Process those specific records and exit.
    When run standalone: Run orchestrator in foreground.
    """
    # A non-TTY stdin means the caller is piping us specific records to process
    has_piped_input = not sys.stdin.isatty()
    if has_piped_input:
        exit_code = process_stdin_records()
    else:
        exit_code = run_orchestrator(daemon=daemon)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()
|
||||
@@ -1,95 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES]
|
||||
archivebox snapshot <action> [args...] [--filters]
|
||||
|
||||
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
|
||||
Manage Snapshot records.
|
||||
|
||||
Input formats:
|
||||
- Plain URLs (one per line)
|
||||
- JSONL: {"type": "Crawl", "id": "...", "urls": "..."}
|
||||
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
|
||||
- Crawl UUIDs (one per line)
|
||||
|
||||
Output (JSONL):
|
||||
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
|
||||
Actions:
|
||||
create - Create Snapshots from URLs or Crawl JSONL
|
||||
list - List Snapshots as JSONL (with optional filters)
|
||||
update - Update Snapshots from stdin JSONL
|
||||
delete - Delete Snapshots from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create snapshots from URLs directly
|
||||
archivebox snapshot https://example.com https://foo.com
|
||||
# Create
|
||||
archivebox snapshot create https://example.com --tag=news
|
||||
archivebox crawl create https://example.com | archivebox snapshot create
|
||||
|
||||
# Pipe from crawl command
|
||||
archivebox crawl https://example.com | archivebox snapshot
|
||||
# List with filters
|
||||
archivebox snapshot list --status=queued
|
||||
archivebox snapshot list --url__icontains=example.com
|
||||
|
||||
# Chain with extract
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
# Update
|
||||
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
|
||||
|
||||
# Run specific plugins after creating snapshots
|
||||
archivebox snapshot --plugins=screenshot,singlefile https://example.com
|
||||
|
||||
# Process existing Snapshot by ID
|
||||
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
|
||||
# Delete
|
||||
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def process_snapshot_by_id(snapshot_id: str) -> int:
    """
    Process a single Snapshot by ID (used by workers).

    Triggers the Snapshot's state machine tick() which will:
    - Transition from queued -> started (creates pending ArchiveResults)
    - Transition from started -> sealed (when all ArchiveResults done)
    """
    from rich import print as rprint
    from archivebox.core.models import Snapshot

    try:
        snapshot = Snapshot.objects.get(id=snapshot_id)
    except Snapshot.DoesNotExist:
        rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)

    try:
        snapshot.sm.tick()
        snapshot.refresh_from_db()
    except Exception as e:
        rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1

    rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_snapshots(
|
||||
args: tuple,
|
||||
urls: Iterable[str],
|
||||
tag: str = '',
|
||||
plugins: str = '',
|
||||
status: str = 'queued',
|
||||
depth: int = 0,
|
||||
created_by_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
|
||||
|
||||
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
|
||||
If --plugins is passed, also runs specified plugins (blocking).
|
||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||
Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged.
|
||||
|
||||
Exit codes:
|
||||
0: Success
|
||||
1: Failure
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_CRAWL
|
||||
@@ -102,7 +70,7 @@ def create_snapshots(
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
@@ -110,11 +78,17 @@ def create_snapshots(
|
||||
|
||||
# Process each record - handle Crawls and plain URLs/Snapshots
|
||||
created_snapshots = []
|
||||
pass_through_count = 0
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
record_type = record.get('type', '')
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
# Pass through the Crawl record itself first
|
||||
if not is_tty:
|
||||
write_record(record)
|
||||
|
||||
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
||||
crawl = None
|
||||
crawl_id = record.get('id')
|
||||
@@ -122,145 +96,295 @@ def create_snapshots(
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
# Crawl doesn't exist, create it
|
||||
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
else:
|
||||
# No ID, create new crawl
|
||||
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
|
||||
if not crawl:
|
||||
continue
|
||||
|
||||
# Create snapshots for each URL in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
# Merge CLI tags with crawl tags
|
||||
merged_tags = crawl.tags_str
|
||||
if tag:
|
||||
if merged_tags:
|
||||
merged_tags = f"{merged_tags},{tag}"
|
||||
else:
|
||||
merged_tags = tag
|
||||
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': 0,
|
||||
'depth': depth,
|
||||
'status': status,
|
||||
}
|
||||
snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_jsonl())
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or record.get('url'):
|
||||
# Input is a Snapshot or plain URL
|
||||
# Add tags if provided via CLI
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
if status:
|
||||
record['status'] = status
|
||||
record['depth'] = record.get('depth', depth)
|
||||
|
||||
snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_jsonl())
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
else:
|
||||
# Pass-through: output records we don't handle
|
||||
if not is_tty:
|
||||
write_record(record)
|
||||
pass_through_count += 1
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not created_snapshots:
|
||||
if pass_through_count > 0:
|
||||
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
|
||||
# If --plugins is passed, create ArchiveResults and run the orchestrator
|
||||
if plugins:
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
# Parse comma-separated plugins list
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
|
||||
|
||||
# Create ArchiveResults for the specific plugins on each snapshot
|
||||
for snapshot in created_snapshots:
|
||||
for plugin_name in plugins_list:
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
# Reset for retry
|
||||
result.status = ArchiveResult.StatusChoices.QUEUED
|
||||
result.retry_at = timezone.now()
|
||||
result.save()
|
||||
|
||||
rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
def is_snapshot_id(value: str) -> bool:
|
||||
"""Check if value looks like a Snapshot UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Snapshot (not a Crawl or other object)
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_snapshots(
    status: Optional[str] = None,
    url__icontains: Optional[str] = None,
    url__istartswith: Optional[str] = None,
    tag: Optional[str] = None,
    crawl_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Snapshots as JSONL with optional filters.

    Writes human-readable rows when stdout is a TTY, JSONL otherwise.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

    queryset = Snapshot.objects.all().order_by('-created_at')

    # Tag filter requires special handling (M2M) and MUST be applied before
    # the limit: Django raises "Cannot filter a query once a slice has been
    # taken" if .filter() is called after the queryset is sliced by limit.
    if tag:
        queryset = queryset.filter(tags__name__iexact=tag)

    # Apply simple field filters (+ limit last, since it slices the queryset)
    filter_kwargs = {
        'status': status,
        'url__icontains': url__icontains,
        'url__istartswith': url__istartswith,
        'crawl_id': crawl_id,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for snapshot in queryset:
        if is_tty:
            status_color = {
                'queued': 'yellow',
                'started': 'blue',
                'sealed': 'green',
            }.get(snapshot.status, 'dim')
            rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
        else:
            write_record(snapshot.to_json())
        count += 1

    rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(tag: str, plugins: str, args: tuple):
|
||||
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
# =============================================================================
# UPDATE
# =============================================================================

def update_snapshots(
    status: Optional[str] = None,
    tag: Optional[str] = None,
) -> int:
    """
    Update Snapshots from stdin JSONL.

    Reads Snapshot records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Exit codes:
        0: Success
        1: No input or error
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        snapshot_id = record.get('id')
        if not snapshot_id:
            continue  # can't update a record with no id

        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
            continue

        # Apply updates from CLI flags (override stdin values)
        if status:
            snapshot.status = status
            snapshot.retry_at = timezone.now()
        if tag:
            # Add tag to existing tags (save first so the M2M write is valid)
            snapshot.save()
            from archivebox.core.models import Tag
            tag_obj, _ = Tag.objects.get_or_create(name=tag)
            snapshot.tags.add(tag_obj)

        snapshot.save()
        updated_count += 1

        if not is_tty:
            write_record(snapshot.to_json())

    rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
# DELETE
# =============================================================================

def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Snapshots from stdin JSONL.

    Requires the --yes flag to confirm deletion (unless --dry-run).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Snapshot

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    snapshot_ids = [r.get('id') for r in records if r.get('id')]

    if not snapshot_ids:
        rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
        return 1

    snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
    count = snapshots.count()

    if count == 0:
        rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
        for snapshot in snapshots:
            rprint(f'  [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion. QuerySet.delete() returns the TOTAL number of rows
    # removed including cascaded related objects (e.g. ArchiveResults), so
    # report the pre-computed Snapshot count to avoid overstating.
    snapshots.delete()
    rprint(f'[green]Deleted {count} snapshots[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
# CLI Commands
# =============================================================================

@click.group()
def main():
    """Manage Snapshot records."""


@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
    """Create Snapshots from URLs or stdin JSONL."""
    sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))


@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
             tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
    """List Snapshots as JSONL."""
    # Forward every filter flag straight through to the query helper
    sys.exit(list_snapshots(
        status=status,
        url__icontains=url__icontains,
        url__istartswith=url__istartswith,
        tag=tag,
        crawl_id=crawl_id,
        limit=limit,
    ))


@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--tag', '-t', help='Add tag')
def update_cmd(status: Optional[str], tag: Optional[str]):
    """Update Snapshots from stdin JSONL."""
    sys.exit(update_snapshots(status=status, tag=tag))


@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Snapshots from stdin JSONL."""
    sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
293
archivebox/cli/archivebox_tag.py
Normal file
293
archivebox/cli/archivebox_tag.py
Normal file
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox tag <action> [args...] [--filters]
|
||||
|
||||
Manage Tag records.
|
||||
|
||||
Actions:
|
||||
create - Create Tags
|
||||
list - List Tags as JSONL (with optional filters)
|
||||
update - Update Tags from stdin JSONL
|
||||
delete - Delete Tags from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create
|
||||
archivebox tag create news tech science
|
||||
archivebox tag create "important stuff"
|
||||
|
||||
# List
|
||||
archivebox tag list
|
||||
archivebox tag list --name__icontains=news
|
||||
|
||||
# Update (rename tags)
|
||||
archivebox tag list --name=oldname | archivebox tag update --name=newname
|
||||
|
||||
# Delete
|
||||
archivebox tag list --name=unused | archivebox tag delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox tag'
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
# CREATE
# =============================================================================

def create_tags(names: Iterable[str]) -> int:
    """
    Create Tags from names.

    Existing tags are reported but not duplicated. Each tag is emitted as
    JSONL on stdout when stdout is not a TTY.

    Exit codes:
        0: Success
        1: Failure (no names given)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Tag

    is_tty = sys.stdout.isatty()

    # Materialize the iterable so we can test for emptiness
    name_list = list(names) if names else []

    if not name_list:
        rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
        return 1

    created_count = 0
    for raw_name in name_list:
        name = raw_name.strip()
        if not name:
            continue  # skip blank entries

        tag, was_created = Tag.objects.get_or_create(name=name)

        if not is_tty:
            write_record(tag.to_json())

        if was_created:
            created_count += 1
            rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
        else:
            rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)

    rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
    return 0
||||
|
||||
|
||||
# =============================================================================
# LIST
# =============================================================================

def list_tags(
    name: Optional[str] = None,
    name__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Tags as JSONL with optional filters.

    Writes human-readable rows when stdout is a TTY, JSONL otherwise.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Tag

    is_tty = sys.stdout.isatty()

    queryset = Tag.objects.all().order_by('name')

    # Apply filters
    filter_kwargs = {
        'name': name,
        'name__icontains': name__icontains,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for tag in queryset:
        if is_tty:
            # The snapshot count is only shown in human-readable output, so
            # only pay for the extra COUNT query per tag in this branch.
            snapshot_count = tag.snapshot_set.count()
            rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
        else:
            write_record(tag.to_json())
        count += 1

    rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
    return 0
||||
|
||||
|
||||
# =============================================================================
# UPDATE
# =============================================================================

def update_tags(name: Optional[str] = None) -> int:
    """
    Update Tags from stdin JSONL.

    Reads Tag records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import Tag

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        tag_id = record.get('id')
        old_name = record.get('name')

        if not (tag_id or old_name):
            continue  # nothing to look the tag up by

        try:
            # Prefer lookup by id, fall back to lookup by current name
            tag = Tag.objects.get(id=tag_id) if tag_id else Tag.objects.get(name=old_name)
        except Tag.DoesNotExist:
            rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
            continue

        # Apply updates from CLI flags
        if name:
            tag.name = name
            tag.save()

        updated_count += 1

        if not is_tty:
            write_record(tag.to_json())

    rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
    return 0
||||
|
||||
|
||||
# =============================================================================
# DELETE
# =============================================================================

def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Tags from stdin JSONL.

    Requires the --yes flag to confirm deletion (unless --dry-run).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Tag

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Collect tag IDs (preferred) or names to match on
    tag_ids = [r['id'] for r in records if r.get('id')]
    tag_names = [r['name'] for r in records if not r.get('id') and r.get('name')]

    if not (tag_ids or tag_names):
        rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
        return 1

    from django.db.models import Q
    query = Q()
    if tag_ids:
        query |= Q(id__in=tag_ids)
    if tag_names:
        query |= Q(name__in=tag_names)

    tags = Tag.objects.filter(query)
    count = tags.count()

    if count == 0:
        rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
        for tag in tags:
            rprint(f'  {tag.name}', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion
    deleted_count, _ = tags.delete()
    rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
    return 0
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Tag records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
def create_cmd(names: tuple):
|
||||
"""Create Tags from names."""
|
||||
sys.exit(create_tags(names))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
"""List Tags as JSONL."""
|
||||
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
"""Update Tags from stdin JSONL."""
|
||||
sys.exit(update_tags(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Tags from stdin JSONL."""
|
||||
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
46
archivebox/cli/cli_utils.py
Normal file
46
archivebox/cli/cli_utils.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
Shared CLI utilities for ArchiveBox commands.
|
||||
|
||||
This module contains common utilities used across multiple CLI commands,
|
||||
extracted to avoid code duplication.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""
|
||||
Apply Django-style filters from CLI kwargs to a QuerySet.
|
||||
|
||||
Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2
|
||||
|
||||
Args:
|
||||
queryset: Django QuerySet to filter
|
||||
filter_kwargs: Dict of filter key-value pairs from CLI
|
||||
limit: Optional limit on results
|
||||
|
||||
Returns:
|
||||
Filtered QuerySet
|
||||
|
||||
Example:
|
||||
queryset = Snapshot.objects.all()
|
||||
filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'}
|
||||
filtered = apply_filters(queryset, filter_kwargs, limit=10)
|
||||
"""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is None or key in ('limit', 'offset'):
|
||||
continue
|
||||
# Handle CSV lists for __in filters
|
||||
if key.endswith('__in') and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(',')]
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
queryset = queryset.filter(**filters)
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
return queryset
|
||||
@@ -1,17 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for CLI piping workflow: crawl | snapshot | extract
|
||||
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
|
||||
|
||||
This module tests the JSONL-based piping between CLI commands as described in:
|
||||
https://github.com/ArchiveBox/ArchiveBox/issues/1363
|
||||
|
||||
Workflows tested:
|
||||
archivebox crawl URL -> Crawl JSONL
|
||||
archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input)
|
||||
archivebox extract -> ArchiveResult JSONL (accepts Snapshot input)
|
||||
archivebox crawl create URL -> Crawl JSONL
|
||||
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
|
||||
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
|
||||
archivebox run -> Process queued records (accepts any JSONL)
|
||||
|
||||
Pipeline:
|
||||
archivebox crawl URL | archivebox snapshot | archivebox extract
|
||||
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
|
||||
|
||||
Each command should:
|
||||
- Accept URLs, IDs, or JSONL as input (args or stdin)
|
||||
@@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase):
|
||||
class TestJSONLOutput(unittest.TestCase):
|
||||
"""Test JSONL output formatting."""
|
||||
|
||||
def test_crawl_to_jsonl(self):
|
||||
"""Crawl model should serialize to JSONL correctly."""
|
||||
def test_crawl_to_json(self):
|
||||
"""Crawl model should serialize to JSON correctly."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
# Create a mock crawl with to_jsonl method configured
|
||||
# Create a mock crawl with to_json method configured
|
||||
mock_crawl = MagicMock()
|
||||
mock_crawl.to_jsonl.return_value = {
|
||||
mock_crawl.to_json.return_value = {
|
||||
'type': TYPE_CRAWL,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-crawl-uuid',
|
||||
@@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase):
|
||||
'created_at': None,
|
||||
}
|
||||
|
||||
result = mock_crawl.to_jsonl()
|
||||
result = mock_crawl.to_json()
|
||||
self.assertEqual(result['type'], TYPE_CRAWL)
|
||||
self.assertEqual(result['id'], 'test-crawl-uuid')
|
||||
self.assertEqual(result['urls'], 'https://example.com')
|
||||
@@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
# using real Snapshot instances.
|
||||
|
||||
|
||||
class TestExtractCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox extract command."""
|
||||
class TestArchiveResultCommand(unittest.TestCase):
|
||||
"""Unit tests for archivebox archiveresult command."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.test_dir, ignore_errors=True)
|
||||
|
||||
def test_extract_accepts_snapshot_id(self):
|
||||
"""extract should accept snapshot IDs as input."""
|
||||
def test_archiveresult_accepts_snapshot_id(self):
|
||||
"""archiveresult should accept snapshot IDs as input."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
|
||||
uuid = '01234567-89ab-cdef-0123-456789abcdef'
|
||||
@@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['id'], uuid)
|
||||
|
||||
def test_extract_accepts_jsonl_snapshot(self):
|
||||
"""extract should accept JSONL Snapshot records."""
|
||||
def test_archiveresult_accepts_jsonl_snapshot(self):
|
||||
"""archiveresult should accept JSONL Snapshot records."""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
|
||||
|
||||
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
|
||||
@@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase):
|
||||
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(records[0]['id'], 'abc123')
|
||||
|
||||
def test_extract_gathers_snapshot_ids(self):
|
||||
"""extract should gather snapshot IDs from various input formats."""
|
||||
def test_archiveresult_gathers_snapshot_ids(self):
|
||||
"""archiveresult should gather snapshot IDs from various input formats."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
|
||||
records = [
|
||||
@@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Create crawl with multiple URLs (as newline-separated string)
|
||||
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
|
||||
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
|
||||
self.assertIsNotNone(crawl)
|
||||
self.assertIsNotNone(crawl.id)
|
||||
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertIn('https://test-crawl-2.example.com', urls_list)
|
||||
|
||||
# Verify output format
|
||||
output = crawl.to_jsonl()
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['type'], TYPE_CRAWL)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['urls'], urls)
|
||||
@@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Step 1: Create crawl (simulating 'archivebox crawl')
|
||||
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
|
||||
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl_output = crawl.to_jsonl()
|
||||
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
|
||||
crawl_output = crawl.to_json()
|
||||
|
||||
# Step 2: Parse crawl output as snapshot input
|
||||
stdin = StringIO(json.dumps(crawl_output) + '\n')
|
||||
@@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
# Step 3: Create snapshots from crawl URLs
|
||||
created_snapshots = []
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
@@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Verify snapshot output
|
||||
for snapshot in created_snapshots:
|
||||
output = snapshot.to_jsonl()
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn(output['url'], [
|
||||
'https://crawl-to-snap-1.example.com',
|
||||
@@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Create snapshot
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
|
||||
snapshot = Snapshot.from_json(records[0], overrides=overrides)
|
||||
|
||||
self.assertIsNotNone(snapshot.id)
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
# Verify output format
|
||||
output = snapshot.to_jsonl()
|
||||
output = snapshot.to_json()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['url'], url)
|
||||
@@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
# Step 1: Create snapshot (simulating 'archivebox snapshot')
|
||||
url = 'https://test-extract-1.example.com'
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot.to_jsonl()
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot.to_json()
|
||||
|
||||
# Step 2: Parse snapshot output as extract input
|
||||
stdin = StringIO(json.dumps(snapshot_output) + '\n')
|
||||
@@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# === archivebox crawl https://example.com ===
|
||||
url = 'https://test-pipeline-full.example.com'
|
||||
crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
crawl_jsonl = json.dumps(crawl.to_jsonl())
|
||||
crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
crawl_jsonl = json.dumps(crawl.to_json())
|
||||
|
||||
# === | archivebox snapshot ===
|
||||
stdin = StringIO(crawl_jsonl + '\n')
|
||||
@@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
if crawl_id:
|
||||
db_crawl = Crawl.objects.get(id=crawl_id)
|
||||
for crawl_url in db_crawl.get_urls_list():
|
||||
snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
@@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertEqual(created_snapshots[0].url, url)
|
||||
|
||||
# === | archivebox extract ===
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
|
||||
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
@@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
|
||||
# Create crawl with depth 0
|
||||
url = 'https://depth0-test.example.com'
|
||||
crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
|
||||
|
||||
self.assertEqual(crawl.max_depth, 0)
|
||||
|
||||
# Create snapshot
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
def test_depth_metadata_in_crawl(self):
|
||||
@@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Create crawl with depth
|
||||
crawl = Crawl.from_jsonl(
|
||||
crawl = Crawl.from_json(
|
||||
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
|
||||
overrides={'created_by_id': created_by_id}
|
||||
)
|
||||
@@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
self.assertEqual(crawl.max_depth, 2)
|
||||
|
||||
# Verify in JSONL output
|
||||
output = crawl.to_jsonl()
|
||||
output = crawl.to_json()
|
||||
self.assertEqual(output['max_depth'], 2)
|
||||
|
||||
|
||||
@@ -956,5 +957,129 @@ class TestEdgeCases(unittest.TestCase):
|
||||
self.assertEqual(urls[2], 'https://url3.com')
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pass-Through Behavior Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestPassThroughBehavior(unittest.TestCase):
|
||||
"""Test pass-through behavior in CLI commands."""
|
||||
|
||||
def test_crawl_passes_through_other_types(self):
|
||||
"""crawl create should pass through records with other types."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
# Input: a Tag record (not a Crawl or URL)
|
||||
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
|
||||
url_record = {'url': 'https://example.com'}
|
||||
|
||||
# Mock stdin with both records
|
||||
stdin = StringIO(
|
||||
json.dumps(tag_record) + '\n' +
|
||||
json.dumps(url_record)
|
||||
)
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
# The Tag should be passed through, the URL should create a Crawl
|
||||
# (This is a unit test of the pass-through logic)
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 2)
|
||||
# First record is a Tag (other type)
|
||||
self.assertEqual(records[0]['type'], 'Tag')
|
||||
# Second record has a URL
|
||||
self.assertIn('url', records[1])
|
||||
|
||||
def test_snapshot_passes_through_crawl(self):
|
||||
"""snapshot create should pass through Crawl records."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
|
||||
|
||||
crawl_record = {
|
||||
'type': TYPE_CRAWL,
|
||||
'id': 'test-crawl',
|
||||
'urls': 'https://example.com',
|
||||
}
|
||||
|
||||
# Crawl records should be passed through AND create snapshots
|
||||
# This tests the accumulation behavior
|
||||
self.assertEqual(crawl_record['type'], TYPE_CRAWL)
|
||||
self.assertIn('urls', crawl_record)
|
||||
|
||||
def test_archiveresult_passes_through_snapshot(self):
|
||||
"""archiveresult create should pass through Snapshot records."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
snapshot_record = {
|
||||
'type': TYPE_SNAPSHOT,
|
||||
'id': 'test-snapshot',
|
||||
'url': 'https://example.com',
|
||||
}
|
||||
|
||||
# Snapshot records should be passed through
|
||||
self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('url', snapshot_record)
|
||||
|
||||
def test_run_passes_through_unknown_types(self):
|
||||
"""run should pass through records with unknown types."""
|
||||
unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
|
||||
|
||||
# Unknown types should be passed through unchanged
|
||||
self.assertEqual(unknown_record['type'], 'Unknown')
|
||||
self.assertIn('data', unknown_record)
|
||||
|
||||
|
||||
class TestPipelineAccumulation(unittest.TestCase):
|
||||
"""Test that pipelines accumulate records correctly."""
|
||||
|
||||
def test_full_pipeline_output_types(self):
|
||||
"""Full pipeline should output all record types."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
|
||||
# Simulated pipeline output after: crawl | snapshot | archiveresult | run
|
||||
# Should contain Crawl, Snapshot, and ArchiveResult records
|
||||
pipeline_output = [
|
||||
{'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
|
||||
{'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
|
||||
{'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
|
||||
]
|
||||
|
||||
types = {r['type'] for r in pipeline_output}
|
||||
self.assertIn(TYPE_CRAWL, types)
|
||||
self.assertIn(TYPE_SNAPSHOT, types)
|
||||
self.assertIn(TYPE_ARCHIVERESULT, types)
|
||||
|
||||
def test_pipeline_preserves_ids(self):
|
||||
"""Pipeline should preserve record IDs through all stages."""
|
||||
records = [
|
||||
{'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
|
||||
{'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
|
||||
]
|
||||
|
||||
# All records should have IDs
|
||||
for record in records:
|
||||
self.assertIn('id', record)
|
||||
self.assertTrue(record['id'])
|
||||
|
||||
def test_jq_transform_pattern(self):
|
||||
"""Test pattern for jq transforms in pipeline."""
|
||||
# Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
|
||||
failed_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'id': 'ar1',
|
||||
'status': 'failed',
|
||||
'plugin': 'wget',
|
||||
}
|
||||
|
||||
# Transform: delete id, set status to queued
|
||||
transformed = {
|
||||
'type': failed_record['type'],
|
||||
'status': 'queued',
|
||||
'plugin': failed_record['plugin'],
|
||||
}
|
||||
|
||||
self.assertNotIn('id', transformed)
|
||||
self.assertEqual(transformed['status'], 'queued')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
@@ -120,6 +120,7 @@ class BaseConfigSet(BaseSettings):
|
||||
def get_config(
|
||||
scope: str = "global",
|
||||
defaults: Optional[Dict] = None,
|
||||
persona: Any = None,
|
||||
user: Any = None,
|
||||
crawl: Any = None,
|
||||
snapshot: Any = None,
|
||||
@@ -131,14 +132,16 @@ def get_config(
|
||||
1. Per-snapshot config (snapshot.config JSON field)
|
||||
2. Per-crawl config (crawl.config JSON field)
|
||||
3. Per-user config (user.config JSON field)
|
||||
4. Environment variables
|
||||
5. Config file (ArchiveBox.conf)
|
||||
6. Plugin schema defaults (config.json)
|
||||
7. Core config defaults
|
||||
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
||||
5. Environment variables
|
||||
6. Config file (ArchiveBox.conf)
|
||||
7. Plugin schema defaults (config.json)
|
||||
8. Core config defaults
|
||||
|
||||
Args:
|
||||
scope: Config scope ('global', 'crawl', 'snapshot', etc.)
|
||||
defaults: Default values to start with
|
||||
persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
|
||||
user: User object with config JSON field
|
||||
crawl: Crawl object with config JSON field
|
||||
snapshot: Snapshot object with config JSON field
|
||||
@@ -205,6 +208,10 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR)
|
||||
if persona and hasattr(persona, "get_derived_config"):
|
||||
config.update(persona.get_derived_config())
|
||||
|
||||
# Apply user config overrides
|
||||
if user and hasattr(user, "config") and user.config:
|
||||
config.update(user.config)
|
||||
@@ -213,6 +220,10 @@ def get_config(
|
||||
if crawl and hasattr(crawl, "config") and crawl.config:
|
||||
config.update(crawl.config)
|
||||
|
||||
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
|
||||
if crawl and hasattr(crawl, "OUTPUT_DIR"):
|
||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
|
||||
|
||||
# Apply snapshot config overrides (highest priority)
|
||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||
config.update(snapshot.config)
|
||||
|
||||
@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
|
||||
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
|
||||
|
||||
# Populate plugin field choices
|
||||
self.fields['chrome_plugins'].choices = [
|
||||
|
||||
@@ -10,8 +10,8 @@ import archivebox.base_models.models
|
||||
|
||||
def cleanup_extra_columns(apps, schema_editor):
|
||||
"""
|
||||
Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
|
||||
The actual models use @property methods to access these values from the process FK.
|
||||
Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
|
||||
This preserves the execution details by moving them to the Process model.
|
||||
"""
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
|
||||
@@ -19,8 +19,41 @@ def cleanup_extra_columns(apps, schema_editor):
|
||||
has_cmd = cursor.fetchone()[0] > 0
|
||||
|
||||
if has_cmd:
|
||||
print(" Cleaning up temporary columns from core_archiveresult...")
|
||||
# Rebuild table without the extra columns
|
||||
print(" Migrating cmd/pwd/cmd_version data to Process records...")
|
||||
|
||||
# For each ArchiveResult, create a Process record with cmd/pwd data
|
||||
# Note: cmd_version from old schema is not preserved (it's now derived from Binary)
|
||||
cursor.execute("""
|
||||
SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
archive_results = cursor.fetchall()
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
|
||||
|
||||
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
|
||||
# Create Process record
|
||||
process_id = str(uuid7())
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_process (
|
||||
id, created_at, modified_at,
|
||||
machine_id, binary_id, iface_id,
|
||||
pwd, cmd, env, timeout,
|
||||
pid, exit_code, stdout, stderr,
|
||||
started_at, ended_at, url, status, retry_at
|
||||
) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
|
||||
""", (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
|
||||
|
||||
# Update ArchiveResult to point to new Process
|
||||
cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
|
||||
|
||||
print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
|
||||
|
||||
# Now rebuild table without the extra columns
|
||||
print(" Rebuilding core_archiveresult table...")
|
||||
cursor.execute("""
|
||||
CREATE TABLE core_archiveresult_final (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -48,14 +81,14 @@ def cleanup_extra_columns(apps, schema_editor):
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
process_id TEXT,
|
||||
process_id TEXT NOT NULL,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
|
||||
)
|
||||
""")
|
||||
|
||||
# Copy data (cmd, pwd, etc. are now accessed via process FK)
|
||||
# Copy data (cmd, pwd, etc. are now in Process records)
|
||||
cursor.execute("""
|
||||
INSERT INTO core_archiveresult_final SELECT
|
||||
id, uuid, created_at, modified_at,
|
||||
|
||||
@@ -0,0 +1,108 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 09:04
|
||||
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0026_final_field_adjustments'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='plugin',
|
||||
field=models.CharField(db_index=True, default='', max_length=32),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='process',
|
||||
field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False),
|
||||
),
|
||||
]
|
||||
@@ -144,7 +144,7 @@ class BinaryAdmin(BaseModelAdmin):
|
||||
|
||||
|
||||
class ProcessAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info', 'health')
|
||||
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')
|
||||
sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
|
||||
search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
|
||||
|
||||
@@ -171,10 +171,6 @@ class ProcessAdmin(BaseModelAdmin):
|
||||
'fields': ('stdout', 'stderr'),
|
||||
'classes': ('card', 'wide', 'collapse'),
|
||||
}),
|
||||
('Usage', {
|
||||
'fields': ('num_uses_succeeded', 'num_uses_failed'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
|
||||
@@ -105,8 +105,6 @@ class Migration(migrations.Migration):
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
machine_id TEXT NOT NULL,
|
||||
binary_id TEXT,
|
||||
@@ -234,8 +232,6 @@ class Migration(migrations.Migration):
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
|
||||
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
|
||||
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
|
||||
|
||||
@@ -24,7 +24,7 @@ __package__ = 'archivebox.misc'
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO, Callable
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
|
||||
"""
|
||||
Filter records by type.
|
||||
"""
|
||||
for record in records:
|
||||
if record.get('type') == record_type:
|
||||
yield record
|
||||
|
||||
|
||||
def process_records(
|
||||
records: Iterator[Dict[str, Any]],
|
||||
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
|
||||
) -> Iterator[Dict[str, Any]]:
|
||||
"""
|
||||
Process records through type-specific handlers.
|
||||
|
||||
Args:
|
||||
records: Input record iterator
|
||||
handlers: Dict mapping type names to handler functions
|
||||
Handlers return output records or None to skip
|
||||
|
||||
Yields output records from handlers.
|
||||
"""
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
handler = handlers.get(record_type)
|
||||
if handler:
|
||||
result = handler(record)
|
||||
if result:
|
||||
yield result
|
||||
|
||||
|
||||
|
||||
@@ -480,12 +480,39 @@ for url_str, num_urls in _test_url_strs.items():
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
Cleans up any state or runtime files that Chrome leaves behind when killed by
|
||||
a timeout or other error. Handles:
|
||||
- All persona chrome_user_data directories (via Persona.cleanup_chrome_all())
|
||||
- Explicit CHROME_USER_DATA_DIR from config
|
||||
- Legacy Docker chromium path
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
|
||||
|
||||
# Clean up all persona chrome directories using Persona class
|
||||
try:
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
# Clean up all personas
|
||||
Persona.cleanup_chrome_all()
|
||||
|
||||
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
|
||||
# (in case it's a custom path not under PERSONAS_DIR)
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config()
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
if chrome_user_data_dir:
|
||||
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
|
||||
if os.path.lexists(singleton_lock):
|
||||
try:
|
||||
singleton_lock.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
except Exception:
|
||||
pass # Persona/config not available during early startup
|
||||
|
||||
# Legacy Docker cleanup (for backwards compatibility)
|
||||
if IN_DOCKER:
|
||||
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
|
||||
if os.path.lexists(singleton_lock):
|
||||
|
||||
29
archivebox/personas/migrations/0001_initial.py
Normal file
29
archivebox/personas/migrations/0001_initial.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 09:06
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Persona',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('config', models.JSONField(blank=True, default=dict, null=True)),
|
||||
('name', models.CharField(max_length=64, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -1,59 +1,155 @@
|
||||
# from django.db import models
|
||||
"""
|
||||
Persona management for ArchiveBox.
|
||||
|
||||
# from django.conf import settings
|
||||
A Persona represents a browser profile/identity used for archiving.
|
||||
Each persona has its own:
|
||||
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
|
||||
- Chrome extensions directory
|
||||
- Cookies file
|
||||
- Config overrides
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.personas'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterator
|
||||
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
|
||||
|
||||
# class Persona(models.Model):
|
||||
# """Aka a "SessionType", its a template for a crawler browsing session containing some config."""
|
||||
class Persona(ModelWithConfig):
|
||||
"""
|
||||
Browser persona/profile for archiving sessions.
|
||||
|
||||
# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
|
||||
|
||||
# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
|
||||
# created_at = AutoDateTimeField(default=None, null=False, db_index=True)
|
||||
# modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
# name = models.CharField(max_length=100, blank=False, null=False, editable=False)
|
||||
|
||||
# persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)
|
||||
# config = models.JSONField(default=dict)
|
||||
# # e.g. {
|
||||
# # USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
|
||||
# # COOKIES_TXT_FILE: '/path/to/cookies.txt',
|
||||
# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
|
||||
# # CHECK_SSL_VALIDITY: False,
|
||||
# # SAVE_ARCHIVEDOTORG: True,
|
||||
# # CHROME_BINARY: 'chromium'
|
||||
# # ...
|
||||
# # }
|
||||
# # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||
# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||
|
||||
# class Meta:
|
||||
# app_label = 'personas'
|
||||
# verbose_name = 'Session Type'
|
||||
# verbose_name_plural = 'Session Types'
|
||||
# unique_together = (('created_by', 'name'),)
|
||||
|
||||
Each persona provides:
|
||||
- CHROME_USER_DATA_DIR: Chrome profile directory
|
||||
- CHROME_EXTENSIONS_DIR: Installed extensions directory
|
||||
- COOKIES_FILE: Cookies file for wget/curl
|
||||
- config: JSON field with persona-specific config overrides
|
||||
|
||||
# def clean(self):
|
||||
# self.persona_dir = settings.PERSONAS_DIR / self.name
|
||||
# assert self.persona_dir == settings.PERSONAS_DIR / self.name, f'Persona dir {self.persona_dir} must match settings.PERSONAS_DIR / self.name'
|
||||
|
||||
|
||||
# # make sure config keys all exist in FLAT_CONFIG
|
||||
# # make sure config values all match expected types
|
||||
# pass
|
||||
|
||||
# def save(self, *args, **kwargs):
|
||||
# self.full_clean()
|
||||
|
||||
# # make sure basic file structure is present in persona_dir:
|
||||
# # - PERSONAS_DIR / self.name /
|
||||
# # - chrome_profile/
|
||||
# # - chrome_downloads/
|
||||
# # - chrome_extensions/
|
||||
# # - cookies.txt
|
||||
# # - auth.json
|
||||
# # - config.json # json dump of the model
|
||||
|
||||
# super().save(*args, **kwargs)
|
||||
Usage:
|
||||
# Get persona and its derived config
|
||||
config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
|
||||
chrome_dir = config['CHROME_USER_DATA_DIR']
|
||||
|
||||
# Or access directly from persona
|
||||
persona = Persona.objects.get(name='Default')
|
||||
persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data
|
||||
"""
|
||||
|
||||
name = models.CharField(max_length=64, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
|
||||
|
||||
class Meta:
|
||||
app_label = 'personas'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
|
||||
@property
|
||||
def path(self) -> Path:
|
||||
"""Path to persona directory under PERSONAS_DIR."""
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
return CONSTANTS.PERSONAS_DIR / self.name
|
||||
|
||||
@property
|
||||
def CHROME_USER_DATA_DIR(self) -> str:
|
||||
"""Derived path to Chrome user data directory for this persona."""
|
||||
return str(self.path / 'chrome_user_data')
|
||||
|
||||
@property
|
||||
def CHROME_EXTENSIONS_DIR(self) -> str:
|
||||
"""Derived path to Chrome extensions directory for this persona."""
|
||||
return str(self.path / 'chrome_extensions')
|
||||
|
||||
@property
|
||||
def COOKIES_FILE(self) -> str:
|
||||
"""Derived path to cookies.txt file for this persona (if exists)."""
|
||||
cookies_path = self.path / 'cookies.txt'
|
||||
return str(cookies_path) if cookies_path.exists() else ''
|
||||
|
||||
def get_derived_config(self) -> dict:
|
||||
"""
|
||||
Get config dict with derived paths filled in.
|
||||
|
||||
Returns dict with:
|
||||
- All values from self.config JSONField
|
||||
- CHROME_USER_DATA_DIR (derived from persona path)
|
||||
- CHROME_EXTENSIONS_DIR (derived from persona path)
|
||||
- COOKIES_FILE (derived from persona path, if file exists)
|
||||
- ACTIVE_PERSONA (set to this persona's name)
|
||||
"""
|
||||
derived = dict(self.config or {})
|
||||
|
||||
# Add derived paths (don't override if explicitly set in config)
|
||||
if 'CHROME_USER_DATA_DIR' not in derived:
|
||||
derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR
|
||||
if 'CHROME_EXTENSIONS_DIR' not in derived:
|
||||
derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR
|
||||
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
|
||||
derived['COOKIES_FILE'] = self.COOKIES_FILE
|
||||
|
||||
# Always set ACTIVE_PERSONA to this persona's name
|
||||
derived['ACTIVE_PERSONA'] = self.name
|
||||
|
||||
return derived
|
||||
|
||||
def ensure_dirs(self) -> None:
|
||||
"""Create persona directories if they don't exist."""
|
||||
self.path.mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def cleanup_chrome(self) -> bool:
|
||||
"""
|
||||
Clean up Chrome state files (SingletonLock, etc.) for this persona.
|
||||
|
||||
Returns:
|
||||
True if cleanup was performed, False if no cleanup needed
|
||||
"""
|
||||
cleaned = False
|
||||
chrome_dir = self.path / 'chrome_user_data'
|
||||
|
||||
if not chrome_dir.exists():
|
||||
return False
|
||||
|
||||
# Clean up SingletonLock files
|
||||
for lock_file in chrome_dir.glob('**/SingletonLock'):
|
||||
try:
|
||||
lock_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Clean up SingletonSocket files
|
||||
for socket_file in chrome_dir.glob('**/SingletonSocket'):
|
||||
try:
|
||||
socket_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
return cleaned
|
||||
|
||||
@classmethod
|
||||
def get_or_create_default(cls) -> 'Persona':
|
||||
"""Get or create the Default persona."""
|
||||
persona, _ = cls.objects.get_or_create(name='Default')
|
||||
return persona
|
||||
|
||||
@classmethod
|
||||
def cleanup_chrome_all(cls) -> int:
|
||||
"""Clean up Chrome state files for all personas."""
|
||||
cleaned = 0
|
||||
for persona in cls.objects.all():
|
||||
if persona.cleanup_chrome():
|
||||
cleaned += 1
|
||||
return cleaned
|
||||
|
||||
@@ -56,6 +56,40 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get array environment variable (JSON array or comma-separated string).
|
||||
*
|
||||
* Parsing strategy:
|
||||
* - If value starts with '[', parse as JSON array
|
||||
* - Otherwise, parse as comma-separated values
|
||||
*
|
||||
* This prevents incorrect splitting of arguments that contain internal commas.
|
||||
* For arguments with commas, use JSON format:
|
||||
* CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]'
|
||||
*
|
||||
* @param {string} name - Environment variable name
|
||||
* @param {string[]} [defaultValue=[]] - Default value if not set
|
||||
* @returns {string[]} - Array of strings
|
||||
*/
|
||||
function getEnvArray(name, defaultValue = []) {
|
||||
const val = getEnv(name, '');
|
||||
if (!val) return defaultValue;
|
||||
|
||||
// If starts with '[', parse as JSON array
|
||||
if (val.startsWith('[')) {
|
||||
try {
|
||||
const parsed = JSON.parse(val);
|
||||
if (Array.isArray(parsed)) return parsed;
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`);
|
||||
// Fall through to comma-separated parsing
|
||||
}
|
||||
}
|
||||
|
||||
// Parse as comma-separated values
|
||||
return val.split(',').map(s => s.trim()).filter(Boolean);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse resolution string into width/height.
|
||||
* @param {string} resolution - Resolution string like "1440,2000"
|
||||
@@ -169,86 +203,115 @@ function waitForDebugPort(port, timeout = 30000) {
|
||||
|
||||
/**
|
||||
* Kill zombie Chrome processes from stale crawls.
|
||||
* Scans DATA_DIR/crawls/<crawl_id>/chrome/<name>.pid for stale processes.
|
||||
* Recursively scans DATA_DIR for any */chrome/*.pid files from stale crawls.
|
||||
* Does not assume specific directory structure - works with nested paths.
|
||||
* @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
|
||||
* @returns {number} - Number of zombies killed
|
||||
*/
|
||||
function killZombieChrome(dataDir = null) {
|
||||
dataDir = dataDir || getEnv('DATA_DIR', '.');
|
||||
const crawlsDir = path.join(dataDir, 'crawls');
|
||||
const now = Date.now();
|
||||
const fiveMinutesAgo = now - 300000;
|
||||
let killed = 0;
|
||||
|
||||
console.error('[*] Checking for zombie Chrome processes...');
|
||||
|
||||
if (!fs.existsSync(crawlsDir)) {
|
||||
console.error('[+] No crawls directory found');
|
||||
if (!fs.existsSync(dataDir)) {
|
||||
console.error('[+] No data directory found');
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively find all chrome/.pid files in directory tree
|
||||
* @param {string} dir - Directory to search
|
||||
* @param {number} depth - Current recursion depth (limit to 10)
|
||||
* @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info
|
||||
*/
|
||||
function findChromePidFiles(dir, depth = 0) {
|
||||
if (depth > 10) return []; // Prevent infinite recursion
|
||||
|
||||
const results = [];
|
||||
try {
|
||||
const entries = fs.readdirSync(dir, { withFileTypes: true });
|
||||
|
||||
for (const entry of entries) {
|
||||
if (!entry.isDirectory()) continue;
|
||||
|
||||
const fullPath = path.join(dir, entry.name);
|
||||
|
||||
// Found a chrome directory - check for .pid files
|
||||
if (entry.name === 'chrome') {
|
||||
try {
|
||||
const pidFiles = fs.readdirSync(fullPath).filter(f => f.endsWith('.pid'));
|
||||
const crawlDir = dir; // Parent of chrome/ is the crawl dir
|
||||
|
||||
for (const pidFileName of pidFiles) {
|
||||
results.push({
|
||||
pidFile: path.join(fullPath, pidFileName),
|
||||
crawlDir: crawlDir,
|
||||
});
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip if can't read chrome dir
|
||||
}
|
||||
} else {
|
||||
// Recurse into subdirectory (skip hidden dirs and node_modules)
|
||||
if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
|
||||
results.push(...findChromePidFiles(fullPath, depth + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip if can't read directory
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
try {
|
||||
const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true });
|
||||
|
||||
for (const crawl of crawls) {
|
||||
if (!crawl.isDirectory()) continue;
|
||||
|
||||
const crawlDir = path.join(crawlsDir, crawl.name);
|
||||
const chromeDir = path.join(crawlDir, 'chrome');
|
||||
|
||||
if (!fs.existsSync(chromeDir)) continue;
|
||||
const chromePids = findChromePidFiles(dataDir);
|
||||
|
||||
for (const {pidFile, crawlDir} of chromePids) {
|
||||
// Check if crawl was modified recently (still active)
|
||||
try {
|
||||
const crawlStats = fs.statSync(crawlDir);
|
||||
if (crawlStats.mtimeMs > fiveMinutesAgo) {
|
||||
continue;
|
||||
continue; // Crawl is active, skip
|
||||
}
|
||||
} catch (e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Crawl is stale, check for PIDs
|
||||
// Crawl is stale, check PID
|
||||
try {
|
||||
const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid'));
|
||||
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
|
||||
if (isNaN(pid) || pid <= 0) continue;
|
||||
|
||||
for (const pidFileName of pidFiles) {
|
||||
const pidFile = path.join(chromeDir, pidFileName);
|
||||
// Check if process exists
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
} catch (e) {
|
||||
// Process dead, remove stale PID file
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
|
||||
if (isNaN(pid) || pid <= 0) continue;
|
||||
// Process alive and crawl is stale - zombie!
|
||||
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`);
|
||||
|
||||
// Check if process exists
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
} catch (e) {
|
||||
// Process dead, remove stale PID file
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process alive and crawl is stale - zombie!
|
||||
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
|
||||
try {
|
||||
try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
|
||||
killed++;
|
||||
console.error(`[+] Killed zombie (PID ${pid})`);
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip invalid PID files
|
||||
}
|
||||
try {
|
||||
try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
|
||||
killed++;
|
||||
console.error(`[+] Killed zombie (PID ${pid})`);
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip if can't read chrome dir
|
||||
// Skip invalid PID files
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`[!] Error scanning crawls: ${e.message}`);
|
||||
console.error(`[!] Error scanning for Chrome processes: ${e.message}`);
|
||||
}
|
||||
|
||||
if (killed > 0) {
|
||||
@@ -257,6 +320,31 @@ function killZombieChrome(dataDir = null) {
|
||||
console.error('[+] No zombies found');
|
||||
}
|
||||
|
||||
// Clean up stale SingletonLock files from persona chrome_user_data directories
|
||||
const personasDir = path.join(dataDir, 'personas');
|
||||
if (fs.existsSync(personasDir)) {
|
||||
try {
|
||||
const personas = fs.readdirSync(personasDir, { withFileTypes: true });
|
||||
for (const persona of personas) {
|
||||
if (!persona.isDirectory()) continue;
|
||||
|
||||
const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data');
|
||||
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||
|
||||
if (fs.existsSync(singletonLock)) {
|
||||
try {
|
||||
fs.unlinkSync(singletonLock);
|
||||
console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
|
||||
} catch (e) {
|
||||
// Ignore - may be in use by active Chrome
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Ignore errors scanning personas directory
|
||||
}
|
||||
}
|
||||
|
||||
return killed;
|
||||
}
|
||||
|
||||
@@ -270,8 +358,10 @@ function killZombieChrome(dataDir = null) {
|
||||
* @param {Object} options - Launch options
|
||||
* @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
|
||||
* @param {string} [options.outputDir='chrome'] - Directory for output files
|
||||
* @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
|
||||
* @param {string} [options.resolution='1440,2000'] - Window resolution
|
||||
* @param {boolean} [options.headless=true] - Run in headless mode
|
||||
* @param {boolean} [options.sandbox=true] - Enable Chrome sandbox
|
||||
* @param {boolean} [options.checkSsl=true] - Check SSL certificates
|
||||
* @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions
|
||||
* @param {boolean} [options.killZombies=true] - Kill zombie processes first
|
||||
@@ -281,8 +371,10 @@ async function launchChromium(options = {}) {
|
||||
const {
|
||||
binary = findChromium(),
|
||||
outputDir = 'chrome',
|
||||
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
|
||||
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
|
||||
headless = getEnvBool('CHROME_HEADLESS', true),
|
||||
sandbox = getEnvBool('CHROME_SANDBOX', true),
|
||||
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
|
||||
extensionPaths = [],
|
||||
killZombies = true,
|
||||
@@ -304,41 +396,65 @@ async function launchChromium(options = {}) {
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
}
|
||||
|
||||
// Create user data directory if specified and doesn't exist
|
||||
if (userDataDir) {
|
||||
if (!fs.existsSync(userDataDir)) {
|
||||
fs.mkdirSync(userDataDir, { recursive: true });
|
||||
console.error(`[*] Created user data directory: ${userDataDir}`);
|
||||
}
|
||||
// Clean up any stale SingletonLock file from previous crashed sessions
|
||||
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||
if (fs.existsSync(singletonLock)) {
|
||||
try {
|
||||
fs.unlinkSync(singletonLock);
|
||||
console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find a free port
|
||||
const debugPort = await findFreePort();
|
||||
console.error(`[*] Using debug port: ${debugPort}`);
|
||||
|
||||
// Build Chrome arguments
|
||||
const chromiumArgs = [
|
||||
// Get base Chrome args from config (static flags from CHROME_ARGS env var)
|
||||
// These come from config.json defaults, merged by get_config() in Python
|
||||
const baseArgs = getEnvArray('CHROME_ARGS', []);
|
||||
|
||||
// Get extra user-provided args
|
||||
const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []);
|
||||
|
||||
// Build dynamic Chrome arguments (these must be computed at runtime)
|
||||
const dynamicArgs = [
|
||||
// Remote debugging setup
|
||||
`--remote-debugging-port=${debugPort}`,
|
||||
'--remote-debugging-address=127.0.0.1',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
|
||||
// Sandbox settings (disable in Docker)
|
||||
...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']),
|
||||
|
||||
// Docker-specific workarounds
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
'--disable-sync',
|
||||
'--no-first-run',
|
||||
'--no-default-browser-check',
|
||||
'--disable-default-apps',
|
||||
'--disable-infobars',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-component-update',
|
||||
'--disable-domain-reliability',
|
||||
'--disable-breakpad',
|
||||
'--disable-background-networking',
|
||||
'--disable-background-timer-throttling',
|
||||
'--disable-backgrounding-occluded-windows',
|
||||
'--disable-renderer-backgrounding',
|
||||
'--disable-ipc-flooding-protection',
|
||||
'--password-store=basic',
|
||||
'--use-mock-keychain',
|
||||
'--font-render-hinting=none',
|
||||
'--force-color-profile=srgb',
|
||||
|
||||
// Window size
|
||||
`--window-size=${width},${height}`,
|
||||
|
||||
// User data directory (for persistent sessions with persona)
|
||||
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
|
||||
|
||||
// Headless mode
|
||||
...(headless ? ['--headless=new'] : []),
|
||||
|
||||
// SSL certificate checking
|
||||
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
||||
];
|
||||
|
||||
// Combine all args: base (from config) + dynamic (runtime) + extra (user overrides)
|
||||
// Dynamic args come after base so they can override if needed
|
||||
const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
|
||||
|
||||
// Add extension loading flags
|
||||
if (extensionPaths.length > 0) {
|
||||
const extPathsArg = extensionPaths.join(',');
|
||||
@@ -533,9 +649,9 @@ async function killChrome(pid, outputDir = null) {
|
||||
}
|
||||
|
||||
// Step 8: Clean up PID files
|
||||
// Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
|
||||
if (outputDir) {
|
||||
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
|
||||
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
|
||||
}
|
||||
|
||||
console.error('[*] Chrome cleanup completed');
|
||||
@@ -766,7 +882,8 @@ async function loadOrInstallExtension(ext, extensions_dir = null) {
|
||||
}
|
||||
|
||||
// Determine extensions directory
|
||||
const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';
|
||||
// Use provided dir, or fall back to getExtensionsDir() which handles env vars and defaults
|
||||
const EXTENSIONS_DIR = extensions_dir || getExtensionsDir();
|
||||
|
||||
// Set statically computable extension metadata
|
||||
ext.webstore_id = ext.webstore_id || ext.id;
|
||||
@@ -1225,12 +1342,183 @@ function findChromium() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Shared Extension Installer Utilities
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Get the extensions directory path.
|
||||
* Centralized path calculation used by extension installers and chrome launch.
|
||||
*
|
||||
* Path is derived from environment variables in this priority:
|
||||
* 1. CHROME_EXTENSIONS_DIR (explicit override)
|
||||
* 2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions (default)
|
||||
*
|
||||
* @returns {string} - Absolute path to extensions directory
|
||||
*/
|
||||
function getExtensionsDir() {
|
||||
const dataDir = getEnv('DATA_DIR', '.');
|
||||
const persona = getEnv('ACTIVE_PERSONA', 'Default');
|
||||
return getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||
path.join(dataDir, 'personas', persona, 'chrome_extensions');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get machine type string for platform-specific paths.
|
||||
* Matches Python's archivebox.config.paths.get_machine_type()
|
||||
*
|
||||
* @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
|
||||
*/
|
||||
function getMachineType() {
|
||||
if (process.env.MACHINE_TYPE) {
|
||||
return process.env.MACHINE_TYPE;
|
||||
}
|
||||
|
||||
let machine = process.arch;
|
||||
const system = process.platform;
|
||||
|
||||
// Normalize machine type to match Python's convention
|
||||
if (machine === 'arm64' || machine === 'aarch64') {
|
||||
machine = 'arm64';
|
||||
} else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') {
|
||||
machine = 'x86_64';
|
||||
} else if (machine === 'ia32' || machine === 'x86') {
|
||||
machine = 'x86';
|
||||
}
|
||||
|
||||
return `${machine}-${system}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get LIB_DIR path for platform-specific binaries.
|
||||
* Returns DATA_DIR/lib/MACHINE_TYPE/
|
||||
*
|
||||
* @returns {string} - Absolute path to lib directory
|
||||
*/
|
||||
function getLibDir() {
|
||||
if (process.env.LIB_DIR) {
|
||||
return process.env.LIB_DIR;
|
||||
}
|
||||
const dataDir = getEnv('DATA_DIR', './data');
|
||||
const machineType = getMachineType();
|
||||
return path.join(dataDir, 'lib', machineType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get NODE_MODULES_DIR path for npm packages.
|
||||
* Returns LIB_DIR/npm/node_modules/
|
||||
*
|
||||
* @returns {string} - Absolute path to node_modules directory
|
||||
*/
|
||||
function getNodeModulesDir() {
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
return process.env.NODE_MODULES_DIR;
|
||||
}
|
||||
return path.join(getLibDir(), 'npm', 'node_modules');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all test environment paths as a JSON object.
|
||||
* This is the single source of truth for path calculations - Python calls this
|
||||
* to avoid duplicating path logic.
|
||||
*
|
||||
* @returns {Object} - Object with all test environment paths
|
||||
*/
|
||||
function getTestEnv() {
|
||||
const dataDir = getEnv('DATA_DIR', './data');
|
||||
const machineType = getMachineType();
|
||||
const libDir = getLibDir();
|
||||
const nodeModulesDir = getNodeModulesDir();
|
||||
|
||||
return {
|
||||
DATA_DIR: dataDir,
|
||||
MACHINE_TYPE: machineType,
|
||||
LIB_DIR: libDir,
|
||||
NODE_MODULES_DIR: nodeModulesDir,
|
||||
NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'),
|
||||
CHROME_EXTENSIONS_DIR: getExtensionsDir(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Install a Chrome extension, reusing a cached install when possible.
 *
 * Main entry point for extension installer hooks:
 * - reuses a previous install when its `<name>.extension.json` cache entry
 *   still points at a directory containing a manifest.json
 * - otherwise installs via loadOrInstallExtension() and rewrites the cache
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable name (used for the cache file)
 * @param {Object} [options] - Options
 * @param {string} [options.extensionsDir] - Override extensions directory
 * @param {boolean} [options.quiet=false] - Suppress info logging
 * @returns {Promise<Object|null>} - Installed extension metadata or null on failure
 */
async function installExtensionWithCache(extension, options = {}) {
  const { extensionsDir = getExtensionsDir(), quiet = false } = options;
  const info = (msg) => { if (!quiet) console.log(msg); };
  const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);

  // Fast path: cache entry exists and its unpacked dir still has a manifest.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        info(`[*] ${extension.name} extension already installed (using cache)`);
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn(`[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`);
    }
  }

  // Slow path: install (or re-install) the extension.
  info(`[*] Installing ${extension.name} extension...`);
  const installedExt = await loadOrInstallExtension(extension, extensionsDir);
  if (!installedExt?.version) {
    console.error(`[❌] Failed to install ${extension.name} extension`);
    return null;
  }

  // Persist metadata so future runs can take the fast path above.
  try {
    await fs.promises.mkdir(extensionsDir, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
    info(`[+] Extension metadata written to ${cacheFile}`);
  } catch (e) {
    console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
  }

  info(`[+] ${extension.name} extension installed`);
  return installedExt;
}
|
||||
|
||||
// Export all functions
|
||||
module.exports = {
|
||||
// Environment helpers
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
getEnvArray,
|
||||
parseResolution,
|
||||
// PID file management
|
||||
writePidWithMtime,
|
||||
@@ -1261,6 +1549,14 @@ module.exports = {
|
||||
getExtensionPaths,
|
||||
waitForExtensionTarget,
|
||||
getExtensionTargets,
|
||||
// Shared path utilities (single source of truth for Python/JS)
|
||||
getMachineType,
|
||||
getLibDir,
|
||||
getNodeModulesDir,
|
||||
getExtensionsDir,
|
||||
getTestEnv,
|
||||
// Shared extension installer utilities
|
||||
installExtensionWithCache,
|
||||
// Deprecated - use enableExtensions option instead
|
||||
getExtensionLaunchArgs,
|
||||
};
|
||||
@@ -1273,16 +1569,31 @@ if (require.main === module) {
|
||||
console.log('Usage: chrome_utils.js <command> [args...]');
|
||||
console.log('');
|
||||
console.log('Commands:');
|
||||
console.log(' findChromium');
|
||||
console.log(' installChromium');
|
||||
console.log(' installPuppeteerCore [npm_prefix]');
|
||||
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
||||
console.log(' killChrome <pid> [output_dir]');
|
||||
console.log(' killZombieChrome [data_dir]');
|
||||
console.log(' getExtensionId <path>');
|
||||
console.log(' loadExtensionManifest <path>');
|
||||
console.log(' getExtensionLaunchArgs <extensions_json>');
|
||||
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
|
||||
console.log(' findChromium Find Chrome/Chromium binary');
|
||||
console.log(' installChromium Install Chromium via @puppeteer/browsers');
|
||||
console.log(' installPuppeteerCore Install puppeteer-core npm package');
|
||||
console.log(' launchChromium Launch Chrome with CDP debugging');
|
||||
console.log(' killChrome <pid> Kill Chrome process by PID');
|
||||
console.log(' killZombieChrome Clean up zombie Chrome processes');
|
||||
console.log('');
|
||||
console.log(' getMachineType Get machine type (e.g., x86_64-linux)');
|
||||
console.log(' getLibDir Get LIB_DIR path');
|
||||
console.log(' getNodeModulesDir Get NODE_MODULES_DIR path');
|
||||
console.log(' getExtensionsDir Get Chrome extensions directory');
|
||||
console.log(' getTestEnv Get all paths as JSON (for tests)');
|
||||
console.log('');
|
||||
console.log(' getExtensionId <path> Get extension ID from unpacked path');
|
||||
console.log(' loadExtensionManifest Load extension manifest.json');
|
||||
console.log(' loadOrInstallExtension Load or install an extension');
|
||||
console.log(' installExtensionWithCache Install extension with caching');
|
||||
console.log('');
|
||||
console.log('Environment variables:');
|
||||
console.log(' DATA_DIR Base data directory');
|
||||
console.log(' LIB_DIR Library directory (computed if not set)');
|
||||
console.log(' MACHINE_TYPE Machine type override');
|
||||
console.log(' NODE_MODULES_DIR Node modules directory');
|
||||
console.log(' CHROME_BINARY Chrome binary path');
|
||||
console.log(' CHROME_EXTENSIONS_DIR Extensions directory');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
@@ -1395,6 +1706,46 @@ if (require.main === module) {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getMachineType': {
|
||||
console.log(getMachineType());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getLibDir': {
|
||||
console.log(getLibDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getNodeModulesDir': {
|
||||
console.log(getNodeModulesDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getExtensionsDir': {
|
||||
console.log(getExtensionsDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getTestEnv': {
|
||||
console.log(JSON.stringify(getTestEnv(), null, 2));
|
||||
break;
|
||||
}
|
||||
|
||||
case 'installExtensionWithCache': {
|
||||
const [webstore_id, name] = commandArgs;
|
||||
if (!webstore_id || !name) {
|
||||
console.error('Usage: installExtensionWithCache <webstore_id> <name>');
|
||||
process.exit(1);
|
||||
}
|
||||
const ext = await installExtensionWithCache({ webstore_id, name });
|
||||
if (ext) {
|
||||
console.log(JSON.stringify(ext, null, 2));
|
||||
} else {
|
||||
process.exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
console.error(`Unknown command: ${command}`);
|
||||
process.exit(1);
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
"CHROME_USER_DATA_DIR": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Path to Chrome user data directory for persistent sessions"
|
||||
"description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)"
|
||||
},
|
||||
"CHROME_USER_AGENT": {
|
||||
"type": "string",
|
||||
@@ -53,16 +53,74 @@
|
||||
"CHROME_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"default": [
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--disable-infobars",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-component-update",
|
||||
"--disable-domain-reliability",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-speech-synthesis-api",
|
||||
"--disable-speech-api",
|
||||
"--disable-print-preview",
|
||||
"--disable-notifications",
|
||||
"--disable-desktop-notifications",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-external-intent-requests",
|
||||
"--disable-session-crashed-bubble",
|
||||
"--disable-search-engine-choice-screen",
|
||||
"--disable-datasaver-prompt",
|
||||
"--ash-no-nudges",
|
||||
"--hide-crash-restore-bubble",
|
||||
"--suppress-message-center-popups",
|
||||
"--noerrdialogs",
|
||||
"--no-pings",
|
||||
"--silent-debugger-extension-api",
|
||||
"--deny-permission-prompts",
|
||||
"--safebrowsing-disable-auto-update",
|
||||
"--metrics-recording-only",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
"--disable-cookie-encryption",
|
||||
"--font-render-hinting=none",
|
||||
"--force-color-profile=srgb",
|
||||
"--disable-partial-raster",
|
||||
"--disable-skia-runtime-opts",
|
||||
"--disable-2d-canvas-clip-aa",
|
||||
"--enable-webgl",
|
||||
"--hide-scrollbars",
|
||||
"--export-tagged-pdf",
|
||||
"--generate-pdf-document-outline",
|
||||
"--disable-lazy-loading",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-extensions-http-throttling",
|
||||
"--disable-field-trial-config",
|
||||
"--disable-back-forward-cache",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--disable-gesture-requirement-for-media-playback",
|
||||
"--lang=en-US,en;q=0.9",
|
||||
"--log-level=2",
|
||||
"--enable-logging=stderr"
|
||||
],
|
||||
"x-aliases": ["CHROME_DEFAULT_ARGS"],
|
||||
"description": "Default Chrome command-line arguments"
|
||||
"description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)"
|
||||
},
|
||||
"CHROME_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["CHROME_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to Chrome command"
|
||||
"description": "Extra arguments to append to Chrome command (for user customization)"
|
||||
},
|
||||
"CHROME_PAGELOAD_TIMEOUT": {
|
||||
"type": "integer",
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install hook for Chrome/Chromium and puppeteer-core.
|
||||
|
||||
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||
Also validates config and computes derived values.
|
||||
|
||||
Outputs:
|
||||
- JSONL for Binary and Machine config updates
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
|
||||
Respects CHROME_BINARY env var for custom binary paths.
|
||||
Uses `npx @puppeteer/browsers install chromium@<pinned build>` (see install_chromium(), currently 1563297) and parses output.
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the value of env var *name* with surrounding whitespace stripped.

    Falls back to *default* (also stripped) when the variable is unset.
    """
    raw = os.environ.get(name, default)
    return raw.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive); any other
    value - including unset - yields *default*.
    """
    mapping = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return mapping.get(get_env(name, '').lower(), default)
|
||||
|
||||
|
||||
def detect_docker() -> bool:
    """Detect if running inside Docker container.

    Any one of these markers counts: the Docker sentinel file, an explicit
    IN_DOCKER env flag, or the container sentinel used by other runtimes.
    """
    if os.path.exists('/.dockerenv'):
        return True
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    return os.path.exists('/run/.containerenv')
|
||||
|
||||
|
||||
def get_chrome_version(binary_path: str) -> str | None:
|
||||
"""Get Chrome/Chromium version string."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[binary_path, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def install_puppeteer_core() -> bool:
    """Install puppeteer-core into NODE_MODULES_DIR if it is not already there.

    Returns True when the package is available (or installation was skipped
    because no isolated node_modules dir is configured), False on failure.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules configured: fall back to global install.
        return True

    node_modules = Path(node_modules_dir)
    if (node_modules / 'puppeteer-core').exists():
        return True  # already installed

    # npm --prefix expects the directory that *contains* node_modules/
    npm_prefix = node_modules.parent
    cmd = ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers']
    try:
        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False
    if proc.returncode != 0:
        print(f"[!] Failed to install puppeteer-core: {proc.stderr}", file=sys.stderr)
        return False
    print(f"[+] puppeteer-core installed", file=sys.stderr)
    return True
|
||||
|
||||
|
||||
def install_chromium() -> dict | None:
    """Install Chromium using @puppeteer/browsers and parse output for binary path.

    Expected stdout format: "chromium@<version> <path_to_binary>"
    e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"

    Note: npx is fast when chromium is already cached - it returns the path
    without re-downloading.

    Returns a Binary-record dict (name/abspath/version/binprovider), or None
    on any failure.
    """
    # Use --path to install to puppeteer's standard cache location.
    cache_path = os.path.expanduser('~/.cache/puppeteer')
    cmd = ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}']
    try:
        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
            timeout=300,
        )

        if result.returncode != 0:
            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
            return None

        # stdout looks like: "chromium@1563294 /path/to/Chromium"
        output = result.stdout.strip()
        parts = output.split(' ', 1)
        if len(parts) != 2:
            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
            return None

        version_str, binary_path = parts[0], parts[1].strip()
        if not binary_path or not os.path.exists(binary_path):
            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
            return None

        # Extract the numeric build from "chromium@<build>".
        version = version_str.split('@')[1] if '@' in version_str else None
        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
        return {
            'name': 'chromium',
            'abspath': binary_path,
            'version': version,
            'binprovider': 'puppeteer',
        }
    except subprocess.TimeoutExpired:
        print("[!] Chromium install timed out", file=sys.stderr)
    except FileNotFoundError:
        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
    except Exception as e:
        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
    return None
|
||||
|
||||
|
||||
def _emit_results(computed: dict, warnings: list, errors: list) -> None:
    """Emit the hook output protocol: COMPUTED:KEY=VALUE lines to stdout
    (parsed by the hook runner), WARNING:/ERROR: lines to stderr."""
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)


def main():
    """Validate Chrome config, install Chromium + puppeteer-core, and emit
    Binary/Machine JSONL records plus COMPUTED/WARNING/ERROR output lines.

    Exits 0 when a usable Chromium binary is configured or installed,
    1 when no binary could be found.
    """
    warnings: list = []
    errors: list = []
    computed: dict = {}

    # Install puppeteer-core if NODE_MODULES_DIR is set (best-effort; a
    # failure is reported by the helper and does not abort this hook).
    install_puppeteer_core()

    # Check if Chrome is enabled
    # NOTE(review): read but not acted on below - confirm whether a disabled
    # Chrome should short-circuit this hook.
    chrome_enabled = get_env_bool('CHROME_ENABLED', True)

    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()

    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'

    # Check Node.js availability
    node_binary = get_env('NODE_BINARY', 'node')
    computed['NODE_BINARY'] = node_binary

    # Fast path: CHROME_BINARY is already configured and executable.
    configured_binary = get_env('CHROME_BINARY', '')
    if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
        version = get_chrome_version(configured_binary)
        computed['CHROME_BINARY'] = configured_binary
        computed['CHROME_VERSION'] = version or 'unknown'

        print(json.dumps({
            'type': 'Binary',
            'name': 'chromium',
            'abspath': configured_binary,
            'version': version,
            'binprovider': 'env',
        }))

        _emit_results(computed, warnings, errors)
        sys.exit(0)

    # Otherwise install/find Chromium via puppeteer
    result = install_chromium()

    if result and result.get('abspath'):
        computed['CHROME_BINARY'] = result['abspath']
        computed['CHROME_VERSION'] = result['version'] or 'unknown'

        print(json.dumps({
            'type': 'Binary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'binprovider': result['binprovider'],
        }))

        # Persist the discovered binary path on the Machine config record.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROMIUM_VERSION',
                'value': result['version'],
            }))

        _emit_results(computed, warnings, errors)
        sys.exit(0)
    else:
        errors.append("Chromium binary not found")
        computed['CHROME_BINARY'] = ''
        _emit_results(computed, warnings, errors)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
323
archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
Normal file
323
archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
Normal file
@@ -0,0 +1,323 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Launch a shared Chromium browser session for the entire crawl.
|
||||
*
|
||||
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
||||
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
|
||||
*
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
* - extensions.json: Loaded extensions metadata
|
||||
*
|
||||
* Environment variables:
|
||||
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
||||
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
writePidWithMtime,
|
||||
getExtensionsDir,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
let browserInstance = null;
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach((arg) => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Cleanup handler for SIGTERM
// Gracefully close the shared browser connection (if any), then force-kill
// the Chrome process by PID, and exit 0.
async function cleanup() {
  console.error('[*] Cleaning up Chrome session...');

  // Prefer a graceful puppeteer close so Chrome can flush its profile.
  if (browserInstance) {
    try {
      console.error('[*] Closing browser gracefully...');
      await browserInstance.close();
      browserInstance = null;
      console.error('[+] Browser closed gracefully');
    } catch (e) {
      console.error(`[!] Graceful close failed: ${e.message}`);
    }
  }

  // Hard-kill the Chrome process in case the graceful close didn't end it.
  if (chromePid) {
    await killChrome(chromePid, OUTPUT_DIR);
  }

  process.exit(0);
}
|
||||
|
||||
// Register signal handlers
|
||||
process.on('SIGTERM', cleanup);
|
||||
process.on('SIGINT', cleanup);
|
||||
|
||||
/**
 * Launch the shared per-crawl Chromium session and verify loaded extensions.
 *
 * Flow: find the browser binary -> read cached *.extension.json records ->
 * launch Chromium via launchChromium() -> connect puppeteer over CDP ->
 * resolve real extension IDs from chrome://extensions -> write
 * extensions.json -> stay alive so cleanup() can run on SIGTERM.
 * Exits non-zero (with machine-readable DEPENDENCY_NEEDED/INSTALL_HINT lines
 * on stderr) when no binary is found or the launch fails.
 */
async function main() {
  const args = parseArgs();
  const crawlId = args.crawl_id;

  try {
    const binary = findChromium();
    if (!binary) {
      // Machine-readable dependency hints for the hook runner.
      console.error('ERROR: Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chromium');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
      process.exit(1);
    }

    // Get Chromium version (best-effort; failures leave version empty)
    let version = '';
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {}

    console.error(`[*] Using browser: ${binary}`);
    if (version) console.error(`[*] Version: ${version}`);

    // Load installed extensions
    const extensionsDir = getExtensionsDir();
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');

    if (userDataDir) {
      console.error(`[*] Using user data dir: ${userDataDir}`);
    }

    // Collect cached *.extension.json records whose unpacked dirs still exist.
    const installedExtensions = [];
    const extensionPaths = [];
    if (fs.existsSync(extensionsDir)) {
      const files = fs.readdirSync(extensionsDir);
      for (const file of files) {
        if (file.endsWith('.extension.json')) {
          try {
            const extPath = path.join(extensionsDir, file);
            const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
            if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
              installedExtensions.push(extData);
              extensionPaths.push(extData.unpacked_path);
              console.error(`[*] Loading extension: ${extData.name || file}`);
            }
          } catch (e) {
            console.warn(`[!] Skipping invalid extension cache: ${file}`);
          }
        }
      }
    }

    if (installedExtensions.length > 0) {
      console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
    }

    // Note: PID file is written by run_hook() with hook-specific name
    // Snapshot.cleanup() kills all *.pid processes when done
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }

    // Launch Chromium using consolidated function
    // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
    const result = await launchChromium({
      binary,
      outputDir: OUTPUT_DIR,
      userDataDir,
      extensionPaths,
    });

    if (!result.success) {
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }

    // Save the PID in module-global state so cleanup() can kill it later.
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;

    // Connect puppeteer for extension verification
    console.error(`[*] Connecting puppeteer to CDP...`);
    const browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
      defaultViewport: null,
    });
    browserInstance = browser;

    // Get actual extension IDs from chrome://extensions page
    if (extensionPaths.length > 0) {
      // Give Chrome a moment to register the unpacked extensions.
      await new Promise(r => setTimeout(r, 2000));

      try {
        const extPage = await browser.newPage();
        await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
        await new Promise(r => setTimeout(r, 2000));

        // Parse extension info from the page
        const extensionsFromPage = await extPage.evaluate(() => {
          const extensions = [];
          // Extensions manager uses shadow DOM
          const manager = document.querySelector('extensions-manager');
          if (!manager || !manager.shadowRoot) return extensions;

          const itemList = manager.shadowRoot.querySelector('extensions-item-list');
          if (!itemList || !itemList.shadowRoot) return extensions;

          const items = itemList.shadowRoot.querySelectorAll('extensions-item');
          for (const item of items) {
            const id = item.getAttribute('id');
            const nameEl = item.shadowRoot?.querySelector('#name');
            const name = nameEl?.textContent?.trim() || '';
            if (id && name) {
              extensions.push({ id, name });
            }
          }
          return extensions;
        });

        console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
        for (const e of extensionsFromPage) {
          console.error(` - ${e.id}: "${e.name}"`);
        }

        // Match extensions by name (strict matching)
        for (const ext of installedExtensions) {
          // Read the extension's manifest to get its display name
          const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
          if (fs.existsSync(manifestPath)) {
            const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
            let manifestName = manifest.name || '';

            // Resolve message placeholder (e.g., __MSG_extName__)
            if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
              const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
              const defaultLocale = manifest.default_locale || 'en';
              const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
              if (fs.existsSync(messagesPath)) {
                try {
                  const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
                  if (messages[msgKey] && messages[msgKey].message) {
                    manifestName = messages[msgKey].message;
                  }
                } catch (e) {
                  console.error(`[!] Failed to read messages.json: ${e.message}`);
                }
              }
            }

            console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);

            // Find matching extension from page by exact name match first
            let match = extensionsFromPage.find(e => e.name === manifestName);

            // If no exact match, try case-insensitive exact match
            if (!match) {
              match = extensionsFromPage.find(e =>
                e.name.toLowerCase() === manifestName.toLowerCase()
              );
            }

            if (match) {
              ext.id = match.id;
              console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
            } else {
              console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
            }
          }
        }

        await extPage.close();
      } catch (e) {
        console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
      }

      // Fallback: check browser targets
      // Extension IDs treated as built-in and excluded from the custom count.
      const targets = browser.targets();
      const builtinIds = [
        'nkeimhogjdpnpccoofpliimaahmaaome',
        'fignfifoniblkonapihmkfakmlgkbkcf',
        'ahfgeienlihckogmohjhadlkjgocpleb',
        'mhjfbmdgcfjbbpaeojofohoefgiehjai',
      ];
      const customExtTargets = targets.filter(t => {
        const url = t.url();
        if (!url.startsWith('chrome-extension://')) return false;
        const extId = url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
      });

      console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);

      for (const target of customExtTargets) {
        const url = target.url();
        const extId = url.split('://')[1].split('/')[0];
        console.error(`[+] Extension target: ${extId} (${target.type()})`);
      }

      if (customExtTargets.length === 0 && extensionPaths.length > 0) {
        console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
        console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
      }
    }

    // Write extensions metadata with actual IDs
    if (installedExtensions.length > 0) {
      fs.writeFileSync(
        path.join(OUTPUT_DIR, 'extensions.json'),
        JSON.stringify(installedExtensions, null, 2)
      );
    }

    console.error(`[+] Chromium session started for crawl ${crawlId}`);
    console.error(`[+] CDP URL: ${cdpUrl}`);
    console.error(`[+] PID: ${chromePid}`);

    // Stay alive to handle cleanup on SIGTERM
    console.log('[*] Chromium launch hook staying alive to handle cleanup...');
    setInterval(() => {}, 1000000);

  } catch (e) {
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
|
||||
|
||||
// Entry point: any unhandled rejection from main() is fatal for the hook.
main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -2,7 +2,7 @@
|
||||
/**
|
||||
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
|
||||
*
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
|
||||
* this connects to it and creates a new tab. Otherwise, falls back to launching
|
||||
* its own Chrome instance.
|
||||
*
|
||||
@@ -89,7 +89,7 @@ process.on('SIGINT', cleanup);
|
||||
function findCrawlChromeSession(crawlId) {
|
||||
if (!crawlId) return null;
|
||||
|
||||
// Use CRAWL_OUTPUT_DIR env var set by hooks.py
|
||||
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
|
||||
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
|
||||
if (!crawlOutputDir) return null;
|
||||
|
||||
@@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
|
||||
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
|
||||
|
||||
// Write PID immediately for cleanup
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
|
||||
|
||||
try {
|
||||
// Wait for Chrome to be ready
|
||||
|
||||
869
archivebox/plugins/chrome/tests/chrome_test_helpers.py
Normal file
869
archivebox/plugins/chrome/tests/chrome_test_helpers.py
Normal file
@@ -0,0 +1,869 @@
|
||||
"""
|
||||
Shared Chrome test helpers for plugin integration tests.
|
||||
|
||||
This module provides common utilities for Chrome-based plugin tests, reducing
|
||||
duplication across test files. Functions delegate to chrome_utils.js (the single
|
||||
source of truth) with Python fallbacks.
|
||||
|
||||
Function names match the JS equivalents in snake_case:
|
||||
JS: getMachineType() -> Python: get_machine_type()
|
||||
JS: getLibDir() -> Python: get_lib_dir()
|
||||
JS: getNodeModulesDir() -> Python: get_node_modules_dir()
|
||||
JS: getExtensionsDir() -> Python: get_extensions_dir()
|
||||
JS: findChromium() -> Python: find_chromium()
|
||||
JS: killChrome() -> Python: kill_chrome()
|
||||
JS: getTestEnv() -> Python: get_test_env()
|
||||
|
||||
Usage:
|
||||
# Path helpers (delegate to chrome_utils.js):
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
|
||||
get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin'
|
||||
get_lib_dir, # Path to lib dir
|
||||
get_node_modules_dir, # Path to node_modules
|
||||
get_extensions_dir, # Path to chrome extensions
|
||||
find_chromium, # Find Chrome/Chromium binary
|
||||
kill_chrome, # Kill Chrome process by PID
|
||||
)
|
||||
|
||||
# Test file helpers:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path
|
||||
get_hook_script, # Find hook script by glob pattern
|
||||
PLUGINS_ROOT, # Path to plugins root
|
||||
LIB_DIR, # Path to lib dir (lazy-loaded)
|
||||
NODE_MODULES_DIR, # Path to node_modules (lazy-loaded)
|
||||
)
|
||||
|
||||
# For Chrome session tests:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_chrome_session, # Full Chrome + tab setup
|
||||
cleanup_chrome, # Cleanup by PID
|
||||
chrome_session, # Context manager
|
||||
)
|
||||
|
||||
# For extension tests:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env, # Full dir structure + Chrome install
|
||||
launch_chromium_session, # Launch Chrome, return CDP URL
|
||||
kill_chromium_session, # Cleanup Chrome
|
||||
)
|
||||
|
||||
# Run hooks and parse JSONL:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
run_hook, # Run hook, return (returncode, stdout, stderr)
|
||||
parse_jsonl_output, # Parse JSONL from stdout
|
||||
)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional, List, Dict, Any
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
# Plugin directory locations
|
||||
CHROME_PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
||||
|
||||
# Hook script locations
|
||||
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Path Helpers - delegates to chrome_utils.js with Python fallback
|
||||
# Function names match JS: getMachineType -> get_machine_type, etc.
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
|
||||
"""Call chrome_utils.js CLI command (internal helper).
|
||||
|
||||
This is the central dispatch for calling the JS utilities from Python.
|
||||
All path calculations and Chrome operations are centralized in chrome_utils.js
|
||||
to ensure consistency between Python and JavaScript code.
|
||||
|
||||
Args:
|
||||
command: The CLI command (e.g., 'findChromium', 'getTestEnv')
|
||||
*args: Additional command arguments
|
||||
env: Environment dict (default: current env)
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, stdout, stderr)
|
||||
"""
|
||||
cmd = ['node', str(CHROME_UTILS), command] + list(args)
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env or os.environ.copy()
|
||||
)
|
||||
return result.returncode, result.stdout, result.stderr
|
||||
|
||||
|
||||
def get_plugin_dir(test_file: str) -> Path:
|
||||
"""Get the plugin directory from a test file path.
|
||||
|
||||
Usage:
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
|
||||
Args:
|
||||
test_file: The __file__ of the test module (e.g., test_screenshot.py)
|
||||
|
||||
Returns:
|
||||
Path to the plugin directory (e.g., plugins/screenshot/)
|
||||
"""
|
||||
return Path(test_file).parent.parent
|
||||
|
||||
|
||||
def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
|
||||
"""Find a hook script in a plugin directory by pattern.
|
||||
|
||||
Usage:
|
||||
HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||
|
||||
Args:
|
||||
plugin_dir: Path to the plugin directory
|
||||
pattern: Glob pattern to match
|
||||
|
||||
Returns:
|
||||
Path to the hook script or None if not found
|
||||
"""
|
||||
matches = list(plugin_dir.glob(pattern))
|
||||
return matches[0] if matches else None
|
||||
|
||||
|
||||
def get_machine_type() -> str:
|
||||
"""Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin').
|
||||
|
||||
Matches JS: getMachineType()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first (single source of truth)
|
||||
returncode, stdout, stderr = _call_chrome_utils('getMachineType')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
|
||||
# Fallback to Python computation
|
||||
if os.environ.get('MACHINE_TYPE'):
|
||||
return os.environ['MACHINE_TYPE']
|
||||
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
return f"{machine}-{system}"
|
||||
|
||||
|
||||
def get_lib_dir() -> Path:
|
||||
"""Get LIB_DIR path for platform-specific binaries.
|
||||
|
||||
Matches JS: getLibDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first
|
||||
returncode, stdout, stderr = _call_chrome_utils('getLibDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return Path(stdout.strip())
|
||||
|
||||
# Fallback to Python
|
||||
if os.environ.get('LIB_DIR'):
|
||||
return Path(os.environ['LIB_DIR'])
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
|
||||
def get_node_modules_dir() -> Path:
|
||||
"""Get NODE_MODULES_DIR path for npm packages.
|
||||
|
||||
Matches JS: getNodeModulesDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first
|
||||
returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return Path(stdout.strip())
|
||||
|
||||
# Fallback to Python
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
lib_dir = get_lib_dir()
|
||||
return lib_dir / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
def get_extensions_dir() -> str:
|
||||
"""Get the Chrome extensions directory path.
|
||||
|
||||
Matches JS: getExtensionsDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
try:
|
||||
returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
except subprocess.TimeoutExpired:
|
||||
pass # Fall through to default computation
|
||||
|
||||
# Fallback to default computation if JS call fails
|
||||
data_dir = os.environ.get('DATA_DIR', '.')
|
||||
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
|
||||
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
|
||||
|
||||
|
||||
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
|
||||
"""Find the Chromium binary path.
|
||||
|
||||
Matches JS: findChromium()
|
||||
|
||||
Uses chrome_utils.js which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
|
||||
Args:
|
||||
data_dir: Optional DATA_DIR override
|
||||
|
||||
Returns:
|
||||
Path to Chromium binary or None if not found
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
if data_dir:
|
||||
env['DATA_DIR'] = str(data_dir)
|
||||
returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env)
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool:
|
||||
"""Kill a Chrome process by PID.
|
||||
|
||||
Matches JS: killChrome()
|
||||
|
||||
Uses chrome_utils.js which handles:
|
||||
- SIGTERM then SIGKILL
|
||||
- Process group killing
|
||||
- Zombie process cleanup
|
||||
|
||||
Args:
|
||||
pid: Process ID to kill
|
||||
output_dir: Optional chrome output directory for PID file cleanup
|
||||
|
||||
Returns:
|
||||
True if the kill command succeeded
|
||||
"""
|
||||
args = [str(pid)]
|
||||
if output_dir:
|
||||
args.append(str(output_dir))
|
||||
returncode, stdout, stderr = _call_chrome_utils('killChrome', *args)
|
||||
return returncode == 0
|
||||
|
||||
|
||||
def get_test_env() -> dict:
|
||||
"""Get environment dict with all paths set correctly for tests.
|
||||
|
||||
Matches JS: getTestEnv()
|
||||
|
||||
Tries chrome_utils.js first for path values, builds env dict.
|
||||
Use this for all subprocess calls in plugin tests.
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
|
||||
# Try to get all paths from JS (single source of truth)
|
||||
returncode, stdout, stderr = _call_chrome_utils('getTestEnv')
|
||||
if returncode == 0 and stdout.strip():
|
||||
try:
|
||||
js_env = json.loads(stdout)
|
||||
env.update(js_env)
|
||||
return env
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fallback to Python computation
|
||||
lib_dir = get_lib_dir()
|
||||
env['LIB_DIR'] = str(lib_dir)
|
||||
env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
|
||||
env['MACHINE_TYPE'] = get_machine_type()
|
||||
return env
|
||||
|
||||
|
||||
# Backward compatibility aliases (deprecated, use new names)
|
||||
find_chromium_binary = find_chromium
|
||||
kill_chrome_via_js = kill_chrome
|
||||
get_machine_type_from_js = get_machine_type
|
||||
get_test_env_from_js = get_test_env
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Module-level constants (lazy-loaded on first access)
|
||||
# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||
# =============================================================================
|
||||
|
||||
# These are computed once when first accessed
|
||||
_LIB_DIR: Optional[Path] = None
|
||||
_NODE_MODULES_DIR: Optional[Path] = None
|
||||
|
||||
|
||||
def _get_lib_dir_cached() -> Path:
|
||||
global _LIB_DIR
|
||||
if _LIB_DIR is None:
|
||||
_LIB_DIR = get_lib_dir()
|
||||
return _LIB_DIR
|
||||
|
||||
|
||||
def _get_node_modules_dir_cached() -> Path:
|
||||
global _NODE_MODULES_DIR
|
||||
if _NODE_MODULES_DIR is None:
|
||||
_NODE_MODULES_DIR = get_node_modules_dir()
|
||||
return _NODE_MODULES_DIR
|
||||
|
||||
|
||||
# Module-level constants that can be imported directly
|
||||
# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||
class _LazyPath:
|
||||
"""Lazy path that computes value on first access."""
|
||||
def __init__(self, getter):
|
||||
self._getter = getter
|
||||
self._value = None
|
||||
|
||||
def __fspath__(self):
|
||||
if self._value is None:
|
||||
self._value = self._getter()
|
||||
return str(self._value)
|
||||
|
||||
def __truediv__(self, other):
|
||||
if self._value is None:
|
||||
self._value = self._getter()
|
||||
return self._value / other
|
||||
|
||||
def __str__(self):
|
||||
return self.__fspath__()
|
||||
|
||||
def __repr__(self):
|
||||
return f"<LazyPath: {self.__fspath__()}>"
|
||||
|
||||
|
||||
LIB_DIR = _LazyPath(_get_lib_dir_cached)
|
||||
NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Hook Execution Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_hook(
|
||||
hook_script: Path,
|
||||
url: str,
|
||||
snapshot_id: str,
|
||||
cwd: Optional[Path] = None,
|
||||
env: Optional[dict] = None,
|
||||
timeout: int = 60,
|
||||
extra_args: Optional[List[str]] = None,
|
||||
) -> Tuple[int, str, str]:
|
||||
"""Run a hook script and return (returncode, stdout, stderr).
|
||||
|
||||
Usage:
|
||||
returncode, stdout, stderr = run_hook(
|
||||
HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
|
||||
cwd=tmpdir, env=get_test_env()
|
||||
)
|
||||
|
||||
Args:
|
||||
hook_script: Path to the hook script
|
||||
url: URL to process
|
||||
snapshot_id: Snapshot ID
|
||||
cwd: Working directory (default: current dir)
|
||||
env: Environment dict (default: get_test_env())
|
||||
timeout: Timeout in seconds
|
||||
extra_args: Additional arguments to pass
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, stdout, stderr)
|
||||
"""
|
||||
if env is None:
|
||||
env = get_test_env()
|
||||
|
||||
# Determine interpreter based on file extension
|
||||
if hook_script.suffix == '.py':
|
||||
cmd = ['python', str(hook_script)]
|
||||
elif hook_script.suffix == '.js':
|
||||
cmd = ['node', str(hook_script)]
|
||||
else:
|
||||
cmd = [str(hook_script)]
|
||||
|
||||
cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
|
||||
if extra_args:
|
||||
cmd.extend(extra_args)
|
||||
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
cwd=str(cwd) if cwd else None,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=timeout
|
||||
)
|
||||
return result.returncode, result.stdout, result.stderr
|
||||
|
||||
|
||||
def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
|
||||
"""Parse JSONL output from hook stdout and return the specified record type.
|
||||
|
||||
Usage:
|
||||
result = parse_jsonl_output(stdout)
|
||||
if result and result['status'] == 'succeeded':
|
||||
print("Success!")
|
||||
|
||||
Args:
|
||||
stdout: The stdout from a hook execution
|
||||
record_type: The 'type' field to look for (default: 'ArchiveResult')
|
||||
|
||||
Returns:
|
||||
The parsed JSON dict or None if not found
|
||||
"""
|
||||
for line in stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if not line.startswith('{'):
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == record_type:
|
||||
return record
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def run_hook_and_parse(
|
||||
hook_script: Path,
|
||||
url: str,
|
||||
snapshot_id: str,
|
||||
cwd: Optional[Path] = None,
|
||||
env: Optional[dict] = None,
|
||||
timeout: int = 60,
|
||||
extra_args: Optional[List[str]] = None,
|
||||
) -> Tuple[int, Optional[Dict[str, Any]], str]:
|
||||
"""Run a hook and parse its JSONL output.
|
||||
|
||||
Convenience function combining run_hook() and parse_jsonl_output().
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, parsed_result_or_none, stderr)
|
||||
"""
|
||||
returncode, stdout, stderr = run_hook(
|
||||
hook_script, url, snapshot_id,
|
||||
cwd=cwd, env=env, timeout=timeout, extra_args=extra_args
|
||||
)
|
||||
result = parse_jsonl_output(stdout)
|
||||
return returncode, result, stderr
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Extension Test Helpers
|
||||
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for extension tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
|
||||
Args:
|
||||
tmpdir: Base temporary directory for the test
|
||||
|
||||
Returns:
|
||||
Environment dict with all paths set, or pytest.skip() if Chrome install fails
|
||||
"""
|
||||
import pytest
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Only set headless if not already in environment (allow override for debugging)
|
||||
if 'CHROME_HEADLESS' not in os.environ:
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
|
||||
"""Launch Chromium and return (process, cdp_url).
|
||||
|
||||
This launches Chrome using the chrome launch hook and waits for the CDP URL
|
||||
to become available. Use this for extension tests that need direct CDP access.
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Returns:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to launch or CDP URL not available after 20s
|
||||
"""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
|
||||
"""Clean up Chromium process launched by launch_chromium_session.
|
||||
|
||||
Uses chrome_utils.js killChrome for proper process group handling.
|
||||
|
||||
Args:
|
||||
chrome_launch_process: The Popen object from launch_chromium_session
|
||||
chrome_dir: The chrome directory containing chrome.pid
|
||||
"""
|
||||
# First try to terminate the launch process gracefully
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Read PID and use JS to kill with proper cleanup
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
kill_chrome(chrome_pid, str(chrome_dir))
|
||||
except (ValueError, FileNotFoundError):
|
||||
pass
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Context manager for Chromium sessions with automatic cleanup.
|
||||
|
||||
Usage:
|
||||
with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
|
||||
# Use cdp_url to connect with puppeteer
|
||||
pass
|
||||
# Chromium automatically cleaned up
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome files
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Yields:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
"""
|
||||
chrome_launch_process = None
|
||||
try:
|
||||
chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id)
|
||||
yield chrome_launch_process, cdp_url
|
||||
finally:
|
||||
if chrome_launch_process:
|
||||
kill_chromium_session(chrome_launch_process, chrome_dir)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tab-based Test Helpers
|
||||
# Used by tab-based tests (infiniscroll, modalcloser)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_chrome_session(
|
||||
tmpdir: Path,
|
||||
crawl_id: str = 'test-crawl',
|
||||
snapshot_id: str = 'test-snapshot',
|
||||
test_url: str = 'about:blank',
|
||||
navigate: bool = True,
|
||||
timeout: int = 15,
|
||||
) -> Tuple[subprocess.Popen, int, Path]:
|
||||
"""Set up a Chrome session with tab and optional navigation.
|
||||
|
||||
Creates the directory structure, launches Chrome, creates a tab,
|
||||
and optionally navigates to the test URL.
|
||||
|
||||
Args:
|
||||
tmpdir: Temporary directory for test files
|
||||
crawl_id: ID to use for the crawl
|
||||
snapshot_id: ID to use for the snapshot
|
||||
test_url: URL to navigate to (if navigate=True)
|
||||
navigate: Whether to navigate to the URL after creating tab
|
||||
timeout: Seconds to wait for Chrome to start
|
||||
|
||||
Returns:
|
||||
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to start or tab creation fails
|
||||
"""
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir(exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(timeout):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
||||
raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
# Create snapshot directory structure
|
||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
||||
snapshot_dir.mkdir(exist_ok=True)
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Create tab
|
||||
tab_env = env.copy()
|
||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=tab_env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Tab creation timed out after 60s")
|
||||
|
||||
# Navigate to URL if requested
|
||||
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Navigation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Navigation timed out after 120s")
|
||||
|
||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
|
||||
"""Clean up Chrome processes using chrome_utils.js killChrome.
|
||||
|
||||
Uses the centralized kill logic from chrome_utils.js which handles:
|
||||
- SIGTERM then SIGKILL
|
||||
- Process group killing
|
||||
- Zombie process cleanup
|
||||
|
||||
Args:
|
||||
chrome_launch_process: The Popen object for the chrome launch hook
|
||||
chrome_pid: The PID of the Chrome process
|
||||
chrome_dir: Optional path to chrome output directory
|
||||
"""
|
||||
# First try to terminate the launch process gracefully
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Use JS to kill Chrome with proper process group handling
|
||||
kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chrome_session(
|
||||
tmpdir: Path,
|
||||
crawl_id: str = 'test-crawl',
|
||||
snapshot_id: str = 'test-snapshot',
|
||||
test_url: str = 'about:blank',
|
||||
navigate: bool = True,
|
||||
timeout: int = 15,
|
||||
):
|
||||
"""Context manager for Chrome sessions with automatic cleanup.
|
||||
|
||||
Usage:
|
||||
with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
|
||||
# Run tests with chrome session
|
||||
pass
|
||||
# Chrome automatically cleaned up
|
||||
|
||||
Args:
|
||||
tmpdir: Temporary directory for test files
|
||||
crawl_id: ID to use for the crawl
|
||||
snapshot_id: ID to use for the snapshot
|
||||
test_url: URL to navigate to (if navigate=True)
|
||||
navigate: Whether to navigate to the URL after creating tab
|
||||
timeout: Seconds to wait for Chrome to start
|
||||
|
||||
Yields:
|
||||
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
|
||||
"""
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id=crawl_id,
|
||||
snapshot_id=snapshot_id,
|
||||
test_url=test_url,
|
||||
navigate=navigate,
|
||||
timeout=timeout,
|
||||
)
|
||||
yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
finally:
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
@@ -28,70 +28,25 @@ import tempfile
|
||||
import shutil
|
||||
import platform
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
get_node_modules_dir,
|
||||
find_chromium_binary,
|
||||
CHROME_PLUGIN_DIR as PLUGIN_DIR,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
CHROME_TAB_HOOK,
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
)
|
||||
|
||||
# Get LIB_DIR and MACHINE_TYPE from environment or compute them
|
||||
def get_lib_dir_and_machine_type():
|
||||
"""Get or compute LIB_DIR and MACHINE_TYPE for tests."""
|
||||
from archivebox.config.paths import get_machine_type
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
|
||||
machine_type = os.environ.get('MACHINE_TYPE') or get_machine_type()
|
||||
|
||||
return Path(lib_dir), machine_type
|
||||
|
||||
# Setup NODE_MODULES_DIR to find npm packages
|
||||
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
|
||||
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
NPM_PREFIX = LIB_DIR / 'npm'
|
||||
|
||||
# Chromium install location (relative to DATA_DIR)
|
||||
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
env['MACHINE_TYPE'] = MACHINE_TYPE
|
||||
# Ensure CHROME_BINARY is set to Chromium
|
||||
if 'CHROME_BINARY' not in env:
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
return env
|
||||
|
||||
|
||||
def find_chromium_binary(data_dir=None):
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations (in data_dir/chromium)
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
|
||||
Args:
|
||||
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
||||
"""
|
||||
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
|
||||
# Use provided data_dir, or fall back to env var, or current dir
|
||||
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium', str(search_dir)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_chromium_and_puppeteer_installed():
|
||||
@@ -176,6 +131,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Get test environment with NODE_MODULES_DIR set
|
||||
env = get_test_env()
|
||||
@@ -184,7 +140,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
# Launch Chrome at crawl level (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -292,7 +248,7 @@ def test_chrome_navigation():
|
||||
# Launch Chrome (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -363,7 +319,7 @@ def test_tab_cleanup_on_sigterm():
|
||||
# Launch Chrome (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -423,11 +379,12 @@ def test_multiple_snapshots_share_chrome():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -513,7 +470,7 @@ def test_chrome_cleanup_on_crawl_end():
|
||||
# Launch Chrome in background
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -554,11 +511,12 @@ def test_zombie_prevention_hook_killed():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Launch Chrome
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -221,8 +221,8 @@ async function main() {
|
||||
// Set up listeners BEFORE navigation
|
||||
await setupListeners();
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -20,29 +20,22 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
|
||||
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for favicon plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Plugin script exists
|
||||
2. requests library is available
|
||||
3. Favicon extraction works for real example.com
|
||||
@@ -21,9 +20,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
@@ -14,7 +14,6 @@ Tests verify:
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import tempfile
|
||||
@@ -22,37 +21,19 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Import shared Chrome test helpers
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
TEST_URL = 'https://www.singsing.movie/'
|
||||
|
||||
|
||||
def get_node_modules_dir():
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
||||
# Check if NODE_MODULES_DIR is already set in environment
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
return lib_dir / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
|
||||
@@ -117,94 +98,18 @@ def test_fails_gracefully_without_chrome_session():
|
||||
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
||||
|
||||
|
||||
def setup_chrome_session(tmpdir):
|
||||
"""Helper to set up Chrome session with tab and navigation."""
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(15):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
||||
raise RuntimeError("Chrome CDP URL not found after 15s")
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
# Create snapshot directory structure
|
||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
||||
snapshot_dir.mkdir()
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir()
|
||||
|
||||
# Create tab
|
||||
tab_env = env.copy()
|
||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=tab_env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
||||
|
||||
# Navigate to URL
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Navigation failed: {result.stderr}")
|
||||
|
||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process, chrome_pid):
|
||||
"""Helper to clean up Chrome processes."""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def test_scrolls_page_and_outputs_stats():
|
||||
"""Integration test: scroll page and verify JSONL output format."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-infiniscroll',
|
||||
snapshot_id='snap-infiniscroll',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
# Create infiniscroll output directory (sibling to chrome)
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
@@ -264,7 +169,12 @@ def test_config_scroll_limit_honored():
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-scroll-limit',
|
||||
snapshot_id='snap-limit',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
@@ -316,7 +226,12 @@ def test_config_timeout_honored():
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-timeout',
|
||||
snapshot_id='snap-timeout',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* I Still Don't Care About Cookies Extension Plugin
|
||||
*
|
||||
* Installs and configures the "I still don't care about cookies" Chrome extension
|
||||
* for automatic cookie consent banner dismissal during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
|
||||
*
|
||||
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Dismisses cookie consent popups
|
||||
* - Removes cookie banners
|
||||
* - Accepts necessary cookies to proceed with browsing
|
||||
* - Works on thousands of websites out of the box
|
||||
*/
|
||||
|
||||
// Import extension utilities
|
||||
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
||||
name: 'istilldontcareaboutcookies',
|
||||
};
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*
|
||||
* Note: This extension works out of the box with no configuration needed.
|
||||
* It automatically detects and dismisses cookie banners on page load.
|
||||
*/
|
||||
async function main() {
|
||||
const extension = await installExtensionWithCache(EXTENSION);
|
||||
|
||||
if (extension) {
|
||||
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
@@ -14,9 +14,17 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
@@ -124,79 +132,6 @@ def test_no_configuration_required():
|
||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
||||
"""Create isolated lib directories for tests and return env dict.
|
||||
|
||||
Sets up:
|
||||
LIB_DIR: tmpdir/lib/<arch>
|
||||
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
|
||||
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
|
||||
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
|
||||
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
|
||||
"""
|
||||
import platform
|
||||
arch = platform.machine()
|
||||
system = platform.system().lower()
|
||||
arch_dir = f"{arch}-{system}"
|
||||
|
||||
lib_dir = tmpdir / 'lib' / arch_dir
|
||||
npm_dir = lib_dir / 'npm'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
pip_venv_dir = lib_dir / 'pip' / 'venv'
|
||||
pip_bin_dir = pip_venv_dir / 'bin'
|
||||
|
||||
# Create directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
pip_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Install puppeteer-core to the test node_modules if not present
|
||||
if not (node_modules_dir / 'puppeteer-core').exists():
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
|
||||
|
||||
return {
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'PIP_VENV_DIR': str(pip_venv_dir),
|
||||
'PIP_BIN_DIR': str(pip_bin_dir),
|
||||
}
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
"""
|
||||
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
|
||||
@@ -210,22 +145,11 @@ def test_extension_loads_in_chromium():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env.setdefault('CHROME_HEADLESS', 'true')
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
@@ -245,13 +169,16 @@ def test_extension_loads_in_chromium():
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
crawl_id = 'test-cookies'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -400,156 +327,314 @@ const puppeteer = require('puppeteer-core');
|
||||
pass
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
"""Live test: verify extension hides cookie consent popup on filmin.es.
|
||||
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check if cookie consent elements are visible on a page.
|
||||
|
||||
Uses Chromium with extensions loaded automatically via chrome hook.
|
||||
Returns dict with:
|
||||
- visible: bool - whether any cookie consent element is visible
|
||||
- selector: str - which selector matched (if visible)
|
||||
- elements_found: list - all cookie-related elements found in DOM
|
||||
- html_snippet: str - snippet of the page HTML for debugging
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chromium and test cookie consent hiding
|
||||
test_script = f'''
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
console.error('Navigating to {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
console.error('Navigating to {test_url}...');
|
||||
await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
|
||||
// Wait for extension content script to process page
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
// Wait for page to fully render and any cookie scripts to run
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// Check cookie consent visibility
|
||||
// Check cookie consent visibility using multiple common selectors
|
||||
const result = await page.evaluate(() => {{
|
||||
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
|
||||
// Common cookie consent selectors used by various consent management platforms
|
||||
const selectors = [
|
||||
// CookieYes
|
||||
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal',
|
||||
// OneTrust
|
||||
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
|
||||
// Cookiebot
|
||||
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
|
||||
// Generic cookie banners
|
||||
'[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
|
||||
'[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]',
|
||||
'[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
|
||||
'[id*="cookieconsent"]', '[id*="cookie-law"]',
|
||||
// GDPR banners
|
||||
'[class*="gdpr"]', '[id*="gdpr"]',
|
||||
// Consent banners
|
||||
'[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]',
|
||||
// Privacy banners
|
||||
'[class*="privacy-banner"]', '[class*="privacy-notice"]',
|
||||
// Common frameworks
|
||||
'.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites
|
||||
'.qc-cmp2-container', // Quantcast
|
||||
'.sp-message-container', // SourcePoint
|
||||
];
|
||||
|
||||
const elementsFound = [];
|
||||
let visibleElement = null;
|
||||
|
||||
for (const sel of selectors) {{
|
||||
const el = document.querySelector(sel);
|
||||
if (el) {{
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const visible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
if (visible) return {{ visible: true, selector: sel }};
|
||||
try {{
|
||||
const elements = document.querySelectorAll(sel);
|
||||
for (const el of elements) {{
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const isVisible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
style.opacity !== '0' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
|
||||
elementsFound.push({{
|
||||
selector: sel,
|
||||
visible: isVisible,
|
||||
display: style.display,
|
||||
visibility: style.visibility,
|
||||
opacity: style.opacity,
|
||||
width: rect.width,
|
||||
height: rect.height
|
||||
}});
|
||||
|
||||
if (isVisible && !visibleElement) {{
|
||||
visibleElement = {{ selector: sel, width: rect.width, height: rect.height }};
|
||||
}}
|
||||
}}
|
||||
}} catch (e) {{
|
||||
// Invalid selector, skip
|
||||
}}
|
||||
}}
|
||||
return {{ visible: false }};
|
||||
|
||||
// Also grab a snippet of the HTML to help debug
|
||||
const bodyHtml = document.body.innerHTML.slice(0, 2000);
|
||||
const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') ||
|
||||
bodyHtml.toLowerCase().includes('consent') ||
|
||||
bodyHtml.toLowerCase().includes('gdpr');
|
||||
|
||||
return {{
|
||||
visible: visibleElement !== null,
|
||||
selector: visibleElement ? visibleElement.selector : null,
|
||||
elements_found: elementsFound,
|
||||
has_cookie_keyword_in_html: hasCookieKeyword,
|
||||
html_snippet: bodyHtml.slice(0, 500)
|
||||
}};
|
||||
}});
|
||||
|
||||
console.error('Cookie consent:', JSON.stringify(result));
|
||||
console.error('Cookie consent check result:', JSON.stringify({{
|
||||
visible: result.visible,
|
||||
selector: result.selector,
|
||||
elements_found_count: result.elements_found.length
|
||||
}}));
|
||||
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_extension.js'
|
||||
script_path.write_text(test_script)
|
||||
script_path = script_dir / 'check_cookies.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(script_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Cookie check script failed: {result.stderr}")
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
if not output_lines:
|
||||
raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}")
|
||||
|
||||
return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
"""Live test: verify extension hides cookie consent popup on filmin.es.
|
||||
|
||||
This test runs TWO browser sessions:
|
||||
1. WITHOUT extension - verifies cookie consent IS visible (baseline)
|
||||
2. WITH extension - verifies cookie consent is HIDDEN
|
||||
|
||||
This ensures we're actually testing the extension's effect, not just
|
||||
that a page happens to not have cookie consent.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated env with proper directory structure
|
||||
env_base = setup_test_env(tmpdir)
|
||||
env_base['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# ============================================================
|
||||
# STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 1: BASELINE TEST (no extension)")
|
||||
print("="*60)
|
||||
|
||||
data_dir = Path(env_base['DATA_DIR'])
|
||||
|
||||
env_no_ext = env_base.copy()
|
||||
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
|
||||
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Launch baseline Chromium in crawls directory
|
||||
baseline_crawl_id = 'baseline-no-ext'
|
||||
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
|
||||
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
|
||||
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
|
||||
baseline_process = None
|
||||
|
||||
try:
|
||||
baseline_process, baseline_cdp_url = launch_chromium_session(
|
||||
env_no_ext, baseline_chrome_dir, baseline_crawl_id
|
||||
)
|
||||
print(f"Baseline Chromium launched: {baseline_cdp_url}")
|
||||
|
||||
# Wait a moment for browser to be ready
|
||||
time.sleep(2)
|
||||
|
||||
baseline_result = check_cookie_consent_visibility(
|
||||
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
print(f"Baseline result: visible={baseline_result['visible']}, "
|
||||
f"elements_found={len(baseline_result['elements_found'])}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
assert not test_result['visible'], \
|
||||
f"Cookie consent should be hidden by extension. Result: {test_result}"
|
||||
if baseline_result['elements_found']:
|
||||
print("Elements found in baseline:")
|
||||
for el in baseline_result['elements_found'][:5]: # Show first 5
|
||||
print(f" - {el['selector']}: visible={el['visible']}, "
|
||||
f"display={el['display']}, size={el['width']}x{el['height']}")
|
||||
|
||||
finally:
|
||||
# Clean up Chromium
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
if baseline_process:
|
||||
kill_chromium_session(baseline_process, baseline_chrome_dir)
|
||||
|
||||
# Verify baseline shows cookie consent
|
||||
if not baseline_result['visible']:
|
||||
# If no cookie consent visible in baseline, we can't test the extension
|
||||
# This could happen if:
|
||||
# - The site changed and no longer shows cookie consent
|
||||
# - Cookie consent is region-specific
|
||||
# - Our selectors don't match this site
|
||||
print("\nWARNING: No cookie consent visible in baseline!")
|
||||
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
|
||||
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
|
||||
|
||||
pytest.skip(
|
||||
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
|
||||
f"Elements found: {len(baseline_result['elements_found'])}. "
|
||||
f"The site may have changed or cookie consent may be region-specific."
|
||||
)
|
||||
|
||||
print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})")
|
||||
|
||||
# ============================================================
|
||||
# STEP 2: Install the extension
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 2: INSTALLING EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
env_with_ext = env_base.copy()
|
||||
env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env_with_ext,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# ============================================================
|
||||
# STEP 3: Run WITH extension, verify cookie consent is HIDDEN
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 3: TEST WITH EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
# Launch extension test Chromium in crawls directory
|
||||
ext_crawl_id = 'test-with-ext'
|
||||
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
|
||||
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
ext_chrome_dir = ext_crawl_dir / 'chrome'
|
||||
env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
|
||||
ext_process = None
|
||||
|
||||
try:
|
||||
ext_process, ext_cdp_url = launch_chromium_session(
|
||||
env_with_ext, ext_chrome_dir, ext_crawl_id
|
||||
)
|
||||
print(f"Extension Chromium launched: {ext_cdp_url}")
|
||||
|
||||
# Check that extension was loaded
|
||||
extensions_file = ext_chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
|
||||
# Wait for extension to initialize
|
||||
time.sleep(3)
|
||||
|
||||
ext_result = check_cookie_consent_visibility(
|
||||
ext_cdp_url, TEST_URL, env_with_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"Extension result: visible={ext_result['visible']}, "
|
||||
f"elements_found={len(ext_result['elements_found'])}")
|
||||
|
||||
if ext_result['elements_found']:
|
||||
print("Elements found with extension:")
|
||||
for el in ext_result['elements_found'][:5]:
|
||||
print(f" - {el['selector']}: visible={el['visible']}, "
|
||||
f"display={el['display']}, size={el['width']}x{el['height']}")
|
||||
|
||||
finally:
|
||||
if ext_process:
|
||||
kill_chromium_session(ext_process, ext_chrome_dir)
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: Compare results
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 4: COMPARISON")
|
||||
print("="*60)
|
||||
print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}")
|
||||
print(f"With extension: cookie consent visible = {ext_result['visible']}")
|
||||
|
||||
assert baseline_result['visible'], \
|
||||
"Baseline should show cookie consent (this shouldn't happen, we checked above)"
|
||||
|
||||
assert not ext_result['visible'], \
|
||||
f"Cookie consent should be HIDDEN by extension.\n" \
|
||||
f"Baseline showed consent at: {baseline_result['selector']}\n" \
|
||||
f"But with extension, consent is still visible.\n" \
|
||||
f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}"
|
||||
|
||||
print("\n✓ SUCCESS: Extension correctly hides cookie consent!")
|
||||
print(f" - Baseline showed consent at: {baseline_result['selector']}")
|
||||
print(f" - Extension successfully hid it")
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for mercury plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -19,9 +18,15 @@ import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
|
||||
@@ -22,38 +22,20 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Import shared Chrome test helpers
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
TEST_URL = 'https://www.singsing.movie/'
|
||||
COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
|
||||
def get_node_modules_dir():
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
||||
# Check if NODE_MODULES_DIR is already set in environment
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
return lib_dir / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
|
||||
@@ -118,75 +100,6 @@ def test_fails_gracefully_without_chrome_session():
|
||||
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
||||
|
||||
|
||||
def setup_chrome_session(tmpdir):
|
||||
"""Helper to set up Chrome session with tab."""
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(15):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
||||
raise RuntimeError("Chrome CDP URL not found after 15s")
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
# Create snapshot directory structure
|
||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
||||
snapshot_dir.mkdir()
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir()
|
||||
|
||||
# Create tab
|
||||
tab_env = env.copy()
|
||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser', '--crawl-id=test-modalcloser'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=tab_env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
||||
|
||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process, chrome_pid):
|
||||
"""Helper to clean up Chrome processes."""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def test_background_script_handles_sigterm():
|
||||
"""Test that background script runs and handles SIGTERM correctly."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -194,7 +107,12 @@ def test_background_script_handles_sigterm():
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-modalcloser',
|
||||
snapshot_id='snap-modalcloser',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
# Create modalcloser output directory (sibling to chrome)
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
@@ -264,7 +182,12 @@ def test_dialog_handler_logs_dialogs():
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-dialog',
|
||||
snapshot_id='snap-dialog',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
@@ -312,7 +235,12 @@ def test_config_poll_interval():
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-poll',
|
||||
snapshot_id='snap-poll',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
|
||||
@@ -21,29 +21,22 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Validate hook checks for readability-extractor binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Plugin reports missing dependency correctly
|
||||
@@ -18,10 +17,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Global state
|
||||
@@ -274,8 +274,8 @@ async function main() {
|
||||
// Set up redirect listener BEFORE navigation
|
||||
await setupRedirectListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'responses';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
@@ -323,8 +323,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -20,28 +20,20 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
281
archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js
Executable file
281
archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js
Executable file
@@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* SingleFile Extension Plugin
|
||||
*
|
||||
* DISABLED: Extension functionality commented out - using single-file-cli only
|
||||
*
|
||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||
* Falls back to single-file-cli if the extension is not available.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||
*
|
||||
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Saves complete web pages as single HTML files
|
||||
* - Inlines all resources (CSS, JS, images, fonts)
|
||||
* - Preserves page fidelity better than wget/curl
|
||||
* - Works with SPAs and dynamically loaded content
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const { promisify } = require('util');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// // Import extension utilities
|
||||
// const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// // Extension metadata
|
||||
// const EXTENSION = {
|
||||
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
// name: 'singlefile',
|
||||
// };
|
||||
|
||||
// // Get extensions directory from environment or use default
|
||||
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// /**
|
||||
// * Install the SingleFile extension
|
||||
// */
|
||||
// async function installSinglefileExtension() {
|
||||
// console.log('[*] Installing SingleFile extension...');
|
||||
|
||||
// // Install the extension
|
||||
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||
|
||||
// if (!extension) {
|
||||
// console.error('[❌] Failed to install SingleFile extension');
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// console.log('[+] SingleFile extension installed');
|
||||
// console.log('[+] Web pages will be saved as single HTML files');
|
||||
|
||||
// return extension;
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Wait for a specified amount of time
|
||||
// */
|
||||
// function wait(ms) {
|
||||
// return new Promise(resolve => setTimeout(resolve, ms));
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Save a page using the SingleFile extension
|
||||
// *
|
||||
// * @param {Object} page - Puppeteer page object
|
||||
// * @param {Object} extension - Extension metadata with dispatchAction method
|
||||
// * @param {Object} options - Additional options
|
||||
// * @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
// */
|
||||
// async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
// if (!extension || !extension.version) {
|
||||
// throw new Error('SingleFile extension not found or not loaded');
|
||||
// }
|
||||
|
||||
// const url = await page.url();
|
||||
|
||||
// // Check for unsupported URL schemes
|
||||
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
// const scheme = url.split(':')[0];
|
||||
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
||||
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// // Ensure downloads directory exists
|
||||
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
|
||||
// // Get list of existing files to ignore
|
||||
// const files_before = new Set(
|
||||
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
// .filter(fn => fn.endsWith('.html'))
|
||||
// );
|
||||
|
||||
// // Output directory is current directory (hook already runs in output dir)
|
||||
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
|
||||
// // Bring page to front (extension action button acts on foreground tab)
|
||||
// await page.bringToFront();
|
||||
|
||||
// // Trigger the extension's action (toolbar button click)
|
||||
// await extension.dispatchAction();
|
||||
|
||||
// // Wait for file to appear in downloads directory
|
||||
// const check_delay = 3000; // 3 seconds
|
||||
// const max_tries = 10;
|
||||
// let files_new = [];
|
||||
|
||||
// for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
// await wait(check_delay);
|
||||
|
||||
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
// .filter(fn => fn.endsWith('.html'));
|
||||
|
||||
// files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
// if (files_new.length === 0) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// // Find the matching file by checking if it contains the URL in the HTML header
|
||||
// for (const file of files_new) {
|
||||
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
// const dl_header = dl_text.split('meta charset')[0];
|
||||
|
||||
// if (dl_header.includes(`url: ${url}`)) {
|
||||
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
// await fs.promises.rename(dl_path, out_path);
|
||||
// return out_path;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
/**
|
||||
* Save a page using single-file-cli (fallback method)
|
||||
*
|
||||
* @param {string} url - URL to archive
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithCLI(url, options = {}) {
|
||||
console.log('[*] Falling back to single-file-cli...');
|
||||
|
||||
// Find single-file binary
|
||||
let binary = null;
|
||||
try {
|
||||
const { stdout } = await execAsync('which single-file');
|
||||
binary = stdout.trim();
|
||||
} catch (err) {
|
||||
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Build command
|
||||
const cmd = [
|
||||
binary,
|
||||
'--browser-headless',
|
||||
url,
|
||||
out_path,
|
||||
];
|
||||
|
||||
// Add optional args
|
||||
if (options.userAgent) {
|
||||
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
|
||||
}
|
||||
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
|
||||
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
|
||||
}
|
||||
if (options.ignoreSSL) {
|
||||
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
|
||||
}
|
||||
|
||||
// Execute
|
||||
try {
|
||||
const timeout = options.timeout || 120000;
|
||||
await execAsync(cmd.join(' '), { timeout });
|
||||
|
||||
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
|
||||
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
|
||||
return out_path;
|
||||
}
|
||||
|
||||
console.error('[❌] SingleFile CLI completed but no output file found');
|
||||
return null;
|
||||
} catch (err) {
|
||||
console.error(`[❌] SingleFile CLI error: ${err.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// /**
|
||||
// * Main entry point - install extension before archiving
|
||||
// */
|
||||
// async function main() {
|
||||
// // Check if extension is already cached
|
||||
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||
|
||||
// if (fs.existsSync(cacheFile)) {
|
||||
// try {
|
||||
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||
|
||||
// if (fs.existsSync(manifestPath)) {
|
||||
// console.log('[*] SingleFile extension already installed (using cache)');
|
||||
// return cached;
|
||||
// }
|
||||
// } catch (e) {
|
||||
// // Cache file corrupted, re-install
|
||||
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||
// }
|
||||
// }
|
||||
|
||||
// // Install extension
|
||||
// const extension = await installSinglefileExtension();
|
||||
|
||||
// // Export extension metadata for chrome plugin to load
|
||||
// if (extension) {
|
||||
// // Write extension info to a cache file that chrome plugin can read
|
||||
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||
// await fs.promises.writeFile(
|
||||
// cacheFile,
|
||||
// JSON.stringify(extension, null, 2)
|
||||
// );
|
||||
// console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||
// }
|
||||
|
||||
// return extension;
|
||||
// }
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// EXTENSION,
|
||||
// installSinglefileExtension,
|
||||
// saveSinglefileWithExtension,
|
||||
saveSinglefileWithCLI,
|
||||
};
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// // Run if executed directly
|
||||
// if (require.main === module) {
|
||||
// main().then(() => {
|
||||
// console.log('[✓] SingleFile extension setup complete');
|
||||
// process.exit(0);
|
||||
// }).catch(err => {
|
||||
// console.error('[❌] SingleFile extension setup failed:', err);
|
||||
// process.exit(1);
|
||||
// });
|
||||
// }
|
||||
|
||||
// No-op when run directly (extension install disabled)
|
||||
if (require.main === module) {
|
||||
console.log('[*] SingleFile extension install disabled - using single-file-cli only');
|
||||
process.exit(0);
|
||||
}
|
||||
@@ -77,27 +77,9 @@ def has_staticfile_output() -> bool:
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Chrome binary search paths
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
'chromium', 'chromium-browser', 'chromium-browser-beta',
|
||||
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
|
||||
]
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
|
||||
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
|
||||
|
||||
ALL_CHROME_BINARIES = (
|
||||
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
|
||||
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
|
||||
)
|
||||
|
||||
|
||||
# Chrome session directory (relative to extractor output dir)
|
||||
# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
|
||||
# The centralized Chrome binary search is in chrome_utils.js findChromium().
|
||||
CHROME_SESSION_DIR = '../chrome'
|
||||
|
||||
|
||||
|
||||
@@ -2,195 +2,173 @@
|
||||
Integration tests for singlefile plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists and has correct metadata
|
||||
2. Extension installation and caching works
|
||||
3. Chrome/node dependencies available
|
||||
4. Hook can be executed successfully
|
||||
1. Hook scripts exist with correct naming
|
||||
2. CLI-based singlefile extraction works
|
||||
3. Dependencies available via abx-pkg
|
||||
4. Output contains valid HTML
|
||||
5. Connects to Chrome session via CDP when available
|
||||
6. Works with extensions loaded (ublock, etc.)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None)
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
"""Verify install script exists"""
|
||||
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||
def test_snapshot_hook_exists():
|
||||
"""Verify snapshot extraction hook exists"""
|
||||
assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}"
|
||||
|
||||
|
||||
def test_extension_metadata():
|
||||
"""Test that SingleFile extension has correct metadata"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
|
||||
|
||||
result = subprocess.run(
|
||||
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
|
||||
|
||||
metadata = json.loads(result.stdout)
|
||||
assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
|
||||
assert metadata["name"] == "singlefile"
|
||||
|
||||
|
||||
def test_install_creates_cache():
|
||||
"""Test that install creates extension cache"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||
|
||||
result = subprocess.run(
|
||||
["node", str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Check output mentions installation
|
||||
assert "SingleFile" in result.stdout or "singlefile" in result.stdout
|
||||
|
||||
# Check cache file was created
|
||||
cache_file = ext_dir / "singlefile.extension.json"
|
||||
assert cache_file.exists(), "Cache file should be created"
|
||||
|
||||
# Verify cache content
|
||||
cache_data = json.loads(cache_file.read_text())
|
||||
assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
|
||||
assert cache_data["name"] == "singlefile"
|
||||
|
||||
|
||||
def test_install_twice_uses_cache():
|
||||
"""Test that running install twice uses existing cache on second run"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||
|
||||
# First install - downloads the extension
|
||||
result1 = subprocess.run(
|
||||
["node", str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
|
||||
|
||||
# Verify cache was created
|
||||
cache_file = ext_dir / "singlefile.extension.json"
|
||||
assert cache_file.exists(), "Cache file should exist after first install"
|
||||
|
||||
# Second install - should use cache
|
||||
result2 = subprocess.run(
|
||||
["node", str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
|
||||
|
||||
# Second run should be faster (uses cache) and mention cache
|
||||
assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
|
||||
|
||||
|
||||
def test_no_configuration_required():
|
||||
"""Test that SingleFile works without configuration"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||
# No API keys needed
|
||||
|
||||
result = subprocess.run(
|
||||
["node", str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should work without API keys
|
||||
assert result.returncode == 0
|
||||
|
||||
|
||||
def test_priority_order():
|
||||
"""Test that singlefile has correct priority (04)"""
|
||||
# Extract priority from filename
|
||||
filename = INSTALL_SCRIPT.name
|
||||
assert "04" in filename, "SingleFile should have priority 04"
|
||||
assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"
|
||||
|
||||
|
||||
def test_output_directory_structure():
|
||||
"""Test that plugin defines correct output structure"""
|
||||
# Verify the script mentions singlefile output directory
|
||||
script_content = INSTALL_SCRIPT.read_text()
|
||||
|
||||
# Should mention singlefile output directory
|
||||
assert "singlefile" in script_content.lower()
|
||||
# Should mention HTML output
|
||||
assert ".html" in script_content or "html" in script_content.lower()
|
||||
def test_snapshot_hook_priority():
|
||||
"""Test that snapshot hook has correct priority (50)"""
|
||||
filename = SNAPSHOT_HOOK.name
|
||||
assert "50" in filename, "SingleFile snapshot hook should have priority 50"
|
||||
assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
"""Verify dependencies are available via abx-pkg."""
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify node is available (singlefile uses Chrome extension, needs Node)
|
||||
# Verify node is available
|
||||
node_binary = Binary(name='node', binproviders=[EnvProvider()])
|
||||
node_loaded = node_binary.load()
|
||||
assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
|
||||
|
||||
|
||||
def test_singlefile_hook_runs():
|
||||
"""Verify singlefile hook can be executed and completes."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
def test_singlefile_cli_archives_example_com():
|
||||
"""Test that singlefile CLI archives example.com and produces valid HTML."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Run singlefile extraction hook
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
|
||||
# Run singlefile snapshot hook
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
# Hook should complete successfully (even if it just installs extension)
|
||||
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
|
||||
|
||||
# Verify extension installation happens
|
||||
assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
|
||||
# Verify output file exists
|
||||
output_file = tmpdir / 'singlefile.html'
|
||||
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
|
||||
|
||||
# Verify it contains real HTML
|
||||
html_content = output_file.read_text()
|
||||
assert len(html_content) > 500, "Output file too small to be valid HTML"
|
||||
assert '<!DOCTYPE html>' in html_content or '<html' in html_content, "Output should contain HTML doctype or html tag"
|
||||
assert 'Example Domain' in html_content, "Output should contain example.com content"
|
||||
|
||||
|
||||
def test_singlefile_with_chrome_session():
|
||||
"""Test singlefile connects to existing Chrome session via CDP.
|
||||
|
||||
When a Chrome session exists (chrome/cdp_url.txt), singlefile should
|
||||
connect to it instead of launching a new Chrome instance.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
try:
|
||||
# Set up Chrome session using shared helper
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id='singlefile-test-crawl',
|
||||
snapshot_id='singlefile-test-snap',
|
||||
test_url=TEST_URL,
|
||||
navigate=False, # Don't navigate, singlefile will do that
|
||||
timeout=20,
|
||||
)
|
||||
|
||||
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
|
||||
# So we need to run from a directory that has ../chrome pointing to our chrome dir
|
||||
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
|
||||
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create symlink so singlefile can find the chrome session
|
||||
chrome_link = singlefile_output_dir.parent / 'chrome'
|
||||
if not chrome_link.exists():
|
||||
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
|
||||
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Run singlefile - it should find and use the existing Chrome session
|
||||
result = subprocess.run(
|
||||
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'],
|
||||
cwd=str(singlefile_output_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
# Verify output
|
||||
output_file = singlefile_output_dir / 'singlefile.html'
|
||||
if output_file.exists():
|
||||
html_content = output_file.read_text()
|
||||
assert len(html_content) > 500, "Output file too small"
|
||||
assert 'Example Domain' in html_content, "Should contain example.com content"
|
||||
else:
|
||||
# If singlefile couldn't connect to Chrome, it may have failed
|
||||
# Check if it mentioned browser-server in its args (indicating it tried to use CDP)
|
||||
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
|
||||
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||
|
||||
finally:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_singlefile_disabled_skips():
|
||||
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}"
|
||||
|
||||
# Should NOT emit JSONL when disabled
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'ssl';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'ssl.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -211,8 +211,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener(url);
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'staticfile';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Content-Types that indicate static files
|
||||
@@ -398,8 +398,8 @@ async function main() {
|
||||
// Set up static file listener BEFORE navigation
|
||||
await setupStaticFileListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for title plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Plugin script exists
|
||||
2. Node.js is available
|
||||
3. Title extraction works for real example.com
|
||||
@@ -20,9 +19,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
50
archivebox/plugins/twocaptcha/config.json
Normal file
50
archivebox/plugins/twocaptcha/config.json
Normal file
@@ -0,0 +1,50 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"TWOCAPTCHA_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"],
|
||||
"description": "Enable 2captcha browser extension for automatic CAPTCHA solving"
|
||||
},
|
||||
"TWOCAPTCHA_API_KEY": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"],
|
||||
"x-sensitive": true,
|
||||
"description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)"
|
||||
},
|
||||
"TWOCAPTCHA_RETRY_COUNT": {
|
||||
"type": "integer",
|
||||
"default": 3,
|
||||
"minimum": 0,
|
||||
"maximum": 10,
|
||||
"x-aliases": ["CAPTCHA2_RETRY_COUNT"],
|
||||
"description": "Number of times to retry CAPTCHA solving on error"
|
||||
},
|
||||
"TWOCAPTCHA_RETRY_DELAY": {
|
||||
"type": "integer",
|
||||
"default": 5,
|
||||
"minimum": 0,
|
||||
"maximum": 60,
|
||||
"x-aliases": ["CAPTCHA2_RETRY_DELAY"],
|
||||
"description": "Delay in seconds between CAPTCHA solving retries"
|
||||
},
|
||||
"TWOCAPTCHA_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"x-aliases": ["CAPTCHA2_TIMEOUT"],
|
||||
"description": "Timeout for CAPTCHA solving in seconds"
|
||||
},
|
||||
"TWOCAPTCHA_AUTO_SUBMIT": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Automatically submit forms after CAPTCHA is solved"
|
||||
}
|
||||
}
|
||||
}
|
||||
66
archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
Executable file
66
archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
Executable file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Plugin
|
||||
*
|
||||
* Installs and configures the 2captcha Chrome extension for automatic
|
||||
* CAPTCHA solving during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
|
||||
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
||||
*
|
||||
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Requirements:
|
||||
* - TWOCAPTCHA_API_KEY environment variable must be set
|
||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||
*/
|
||||
|
||||
// Import extension utilities
|
||||
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
|
||||
name: 'twocaptcha',
|
||||
};
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*
|
||||
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
|
||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||
* The API key is injected via chrome.storage API once per browser session.
|
||||
*/
|
||||
async function main() {
|
||||
const extension = await installExtensionWithCache(EXTENSION);
|
||||
|
||||
if (extension) {
|
||||
// Check if API key is configured
|
||||
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
|
||||
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
||||
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
|
||||
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
||||
} else {
|
||||
console.log('[+] 2captcha extension installed and API key configured');
|
||||
}
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] 2captcha extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] 2captcha extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Configuration
|
||||
*
|
||||
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
|
||||
* Runs once per crawl to inject configuration into extension storage.
|
||||
*
|
||||
* Priority: 25 (after chrome_launch at 30, before snapshots start)
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Config Options (from config.json / environment):
|
||||
* - TWOCAPTCHA_API_KEY: API key for 2captcha service
|
||||
* - TWOCAPTCHA_ENABLED: Enable/disable the extension
|
||||
* - TWOCAPTCHA_RETRY_COUNT: Number of retries on error
|
||||
* - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds)
|
||||
* - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving
|
||||
*
|
||||
* Requirements:
|
||||
* - TWOCAPTCHA_API_KEY environment variable must be set
|
||||
* - chrome plugin must have loaded extensions (extensions.json must exist)
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Get crawl's chrome directory from environment variable set by hooks.py
|
||||
function getCrawlChromeSessionDir() {
|
||||
const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || '';
|
||||
if (!crawlOutputDir) {
|
||||
return null;
|
||||
}
|
||||
return path.join(crawlOutputDir, 'chrome');
|
||||
}
|
||||
|
||||
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
|
||||
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured');
|
||||
|
||||
// Get environment variable with default
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
// Get boolean environment variable
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
// Get integer environment variable
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get 2captcha configuration from environment variables.
|
||||
* Supports both TWOCAPTCHA_* and legacy API_KEY_2CAPTCHA naming.
|
||||
*/
|
||||
function getTwoCaptchaConfig() {
|
||||
const apiKey = getEnv('TWOCAPTCHA_API_KEY') || getEnv('API_KEY_2CAPTCHA') || getEnv('CAPTCHA2_API_KEY');
|
||||
const isEnabled = getEnvBool('TWOCAPTCHA_ENABLED', true);
|
||||
const retryCount = getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3);
|
||||
const retryDelay = getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5);
|
||||
const autoSubmit = getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false);
|
||||
|
||||
// Build the full config object matching the extension's storage structure
|
||||
// Structure: chrome.storage.local.set({config: {...}})
|
||||
return {
|
||||
// API key - both variants for compatibility
|
||||
apiKey: apiKey,
|
||||
api_key: apiKey,
|
||||
|
||||
// Plugin enabled state
|
||||
isPluginEnabled: isEnabled,
|
||||
|
||||
// Retry settings
|
||||
repeatOnErrorTimes: retryCount,
|
||||
repeatOnErrorDelay: retryDelay,
|
||||
|
||||
// Auto-submit setting
|
||||
autoSubmitForms: autoSubmit,
|
||||
submitFormsDelay: 0,
|
||||
|
||||
// Enable all CAPTCHA types
|
||||
enabledForNormal: true,
|
||||
enabledForRecaptchaV2: true,
|
||||
enabledForInvisibleRecaptchaV2: true,
|
||||
enabledForRecaptchaV3: true,
|
||||
enabledForRecaptchaAudio: false,
|
||||
enabledForGeetest: true,
|
||||
enabledForGeetest_v4: true,
|
||||
enabledForKeycaptcha: true,
|
||||
enabledForArkoselabs: true,
|
||||
enabledForLemin: true,
|
||||
enabledForYandex: true,
|
||||
enabledForCapyPuzzle: true,
|
||||
enabledForTurnstile: true,
|
||||
enabledForAmazonWaf: true,
|
||||
enabledForMTCaptcha: true,
|
||||
|
||||
// Auto-solve all CAPTCHA types
|
||||
autoSolveNormal: true,
|
||||
autoSolveRecaptchaV2: true,
|
||||
autoSolveInvisibleRecaptchaV2: true,
|
||||
autoSolveRecaptchaV3: true,
|
||||
autoSolveRecaptchaAudio: false,
|
||||
autoSolveGeetest: true,
|
||||
autoSolveGeetest_v4: true,
|
||||
autoSolveKeycaptcha: true,
|
||||
autoSolveArkoselabs: true,
|
||||
autoSolveLemin: true,
|
||||
autoSolveYandex: true,
|
||||
autoSolveCapyPuzzle: true,
|
||||
autoSolveTurnstile: true,
|
||||
autoSolveAmazonWaf: true,
|
||||
autoSolveMTCaptcha: true,
|
||||
|
||||
// Other settings with sensible defaults
|
||||
recaptchaV2Type: 'token',
|
||||
recaptchaV3MinScore: 0.3,
|
||||
buttonPosition: 'inner',
|
||||
useProxy: false,
|
||||
proxy: '',
|
||||
proxytype: 'HTTP',
|
||||
blackListDomain: '',
|
||||
autoSubmitRules: [],
|
||||
normalSources: [],
|
||||
};
|
||||
}
|
||||
|
||||
async function configure2Captcha() {
|
||||
// Check if already configured in this session
|
||||
if (fs.existsSync(CONFIG_MARKER)) {
|
||||
console.error('[*] 2captcha already configured in this browser session');
|
||||
return { success: true, skipped: true };
|
||||
}
|
||||
|
||||
// Get configuration
|
||||
const config = getTwoCaptchaConfig();
|
||||
|
||||
// Check if API key is set
|
||||
if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') {
|
||||
console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured');
|
||||
console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
||||
return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' };
|
||||
}
|
||||
|
||||
console.error('[*] Configuring 2captcha extension...');
|
||||
console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
|
||||
console.error(`[*] Enabled: ${config.isPluginEnabled}`);
|
||||
console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`);
|
||||
console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`);
|
||||
console.error(`[*] Auto Submit: ${config.autoSubmitForms}`);
|
||||
console.error(`[*] Auto Solve: all CAPTCHA types enabled`);
|
||||
|
||||
try {
|
||||
// Connect to the existing Chrome session via CDP
|
||||
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
|
||||
if (!fs.existsSync(cdpFile)) {
|
||||
return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
|
||||
}
|
||||
|
||||
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
|
||||
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
|
||||
|
||||
try {
|
||||
// First, navigate to a page to trigger extension content scripts and wake up service worker
|
||||
console.error('[*] Waking up extension by visiting a page...');
|
||||
const triggerPage = await browser.newPage();
|
||||
try {
|
||||
await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize
|
||||
} catch (e) {
|
||||
console.warn(`[!] Trigger page failed: ${e.message}`);
|
||||
}
|
||||
try { await triggerPage.close(); } catch (e) {}
|
||||
|
||||
// Get 2captcha extension info from extensions.json
|
||||
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
|
||||
if (!fs.existsSync(extensionsFile)) {
|
||||
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
|
||||
}
|
||||
|
||||
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
|
||||
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
|
||||
|
||||
if (!captchaExt) {
|
||||
console.error('[*] 2captcha extension not installed, skipping configuration');
|
||||
return { success: true, skipped: true };
|
||||
}
|
||||
|
||||
if (!captchaExt.id) {
|
||||
return { success: false, error: '2captcha extension ID not found in extensions.json' };
|
||||
}
|
||||
|
||||
const extensionId = captchaExt.id;
|
||||
console.error(`[*] 2captcha Extension ID: ${extensionId}`);
|
||||
|
||||
// Configure via options page
|
||||
console.error('[*] Configuring via options page...');
|
||||
const optionsUrl = `chrome-extension://${extensionId}/options/options.html`;
|
||||
|
||||
let configPage = await browser.newPage();
|
||||
|
||||
try {
|
||||
// Navigate to options page - catch error but continue since page may still load
|
||||
try {
|
||||
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
|
||||
} catch (navError) {
|
||||
// Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads
|
||||
console.error(`[*] Navigation threw error (may still work): ${navError.message}`);
|
||||
}
|
||||
|
||||
// Wait a moment for page to settle
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// Check all pages for the extension page (Chrome may open it in a different tab)
|
||||
const pages = await browser.pages();
|
||||
for (const page of pages) {
|
||||
const url = page.url();
|
||||
if (url.startsWith(`chrome-extension://${extensionId}`)) {
|
||||
configPage = page;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const currentUrl = configPage.url();
|
||||
console.error(`[*] Current URL: ${currentUrl}`);
|
||||
|
||||
if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) {
|
||||
return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` };
|
||||
}
|
||||
|
||||
// Wait for Config object to be available
|
||||
console.error('[*] Waiting for Config object...');
|
||||
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
|
||||
|
||||
// Use chrome.storage.local.set with the config wrapper
|
||||
const result = await configPage.evaluate((cfg) => {
|
||||
return new Promise((resolve) => {
|
||||
if (typeof chrome !== 'undefined' && chrome.storage) {
|
||||
chrome.storage.local.set({ config: cfg }, () => {
|
||||
if (chrome.runtime.lastError) {
|
||||
resolve({ success: false, error: chrome.runtime.lastError.message });
|
||||
} else {
|
||||
resolve({ success: true, method: 'options_page' });
|
||||
}
|
||||
});
|
||||
} else {
|
||||
resolve({ success: false, error: 'chrome.storage not available' });
|
||||
}
|
||||
});
|
||||
}, config);
|
||||
|
||||
if (result.success) {
|
||||
console.error(`[+] 2captcha configured via ${result.method}`);
|
||||
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
|
||||
timestamp: new Date().toISOString(),
|
||||
method: result.method,
|
||||
extensionId: extensionId,
|
||||
config: {
|
||||
apiKeySet: !!config.apiKey,
|
||||
isPluginEnabled: config.isPluginEnabled,
|
||||
repeatOnErrorTimes: config.repeatOnErrorTimes,
|
||||
repeatOnErrorDelay: config.repeatOnErrorDelay,
|
||||
autoSubmitForms: config.autoSubmitForms,
|
||||
autoSolveEnabled: true,
|
||||
}
|
||||
}, null, 2));
|
||||
return { success: true, method: result.method };
|
||||
}
|
||||
|
||||
return { success: false, error: result.error || 'Config failed' };
|
||||
} finally {
|
||||
try { await configPage.close(); } catch (e) {}
|
||||
}
|
||||
} finally {
|
||||
browser.disconnect();
|
||||
}
|
||||
} catch (e) {
|
||||
return { success: false, error: `${e.name}: ${e.message}` };
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const url = args.url;
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
const result = await configure2Captcha();
|
||||
|
||||
if (result.skipped) {
|
||||
status = 'skipped';
|
||||
} else if (result.success) {
|
||||
status = 'succeeded';
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error || 'Configuration failed';
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
const duration = (endTs - startTs) / 1000;
|
||||
|
||||
if (error) {
|
||||
console.error(`ERROR: ${error}`);
|
||||
}
|
||||
|
||||
// Config hooks don't emit JSONL - they're utility hooks for setup
|
||||
// Exit code indicates success/failure
|
||||
|
||||
process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
0
archivebox/plugins/twocaptcha/templates/icon.html
Normal file
0
archivebox/plugins/twocaptcha/templates/icon.html
Normal file
237
archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
Normal file
237
archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
Integration tests for twocaptcha plugin
|
||||
|
||||
Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
|
||||
|
||||
NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||
|
||||
|
||||
# Alias for backward compatibility with existing test names
|
||||
launch_chrome = launch_chromium_session
|
||||
kill_chrome = kill_chromium_session
|
||||
|
||||
|
||||
class TestTwoCaptcha:
|
||||
"""Integration tests requiring TWOCAPTCHA_API_KEY."""
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup(self):
|
||||
self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
|
||||
if not self.api_key:
|
||||
pytest.skip("TWOCAPTCHA_API_KEY required")
|
||||
|
||||
def test_install_and_load(self):
|
||||
"""Extension installs and loads in Chromium."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = setup_test_env(tmpdir)
|
||||
env['TWOCAPTCHA_API_KEY'] = self.api_key
|
||||
|
||||
# Install
|
||||
result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
|
||||
assert result.returncode == 0, f"Install failed: {result.stderr}"
|
||||
|
||||
cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
|
||||
assert cache.exists()
|
||||
data = json.loads(cache.read_text())
|
||||
assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'
|
||||
|
||||
# Launch Chromium in crawls directory
|
||||
crawl_id = 'test'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
exts = json.loads((chrome_dir / 'extensions.json').read_text())
|
||||
assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
|
||||
print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
|
||||
finally:
|
||||
kill_chrome(process, chrome_dir)
|
||||
|
||||
def test_config_applied(self):
|
||||
"""Configuration is applied to extension and verified via Config.getAll()."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = setup_test_env(tmpdir)
|
||||
env['TWOCAPTCHA_API_KEY'] = self.api_key
|
||||
env['TWOCAPTCHA_RETRY_COUNT'] = '5'
|
||||
env['TWOCAPTCHA_RETRY_DELAY'] = '10'
|
||||
|
||||
subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
|
||||
|
||||
# Launch Chromium in crawls directory
|
||||
crawl_id = 'cfg'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
|
||||
env=env, timeout=30, capture_output=True, text=True
|
||||
)
|
||||
assert result.returncode == 0, f"Config failed: {result.stderr}"
|
||||
assert (chrome_dir / '.twocaptcha_configured').exists()
|
||||
|
||||
# Verify config via options.html and Config.getAll()
|
||||
# Get the actual extension ID from the config marker (Chrome computes IDs differently)
|
||||
config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
|
||||
ext_id = config_marker['extensionId']
|
||||
script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Load options.html and use Config.getAll() to verify
|
||||
const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
|
||||
const page = await browser.newPage();
|
||||
console.error('[*] Loading options page:', optionsUrl);
|
||||
|
||||
// Navigate - catch error but continue since page may still load
|
||||
try {{
|
||||
await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
|
||||
}} catch (e) {{
|
||||
console.error('[*] Navigation threw error (may still work):', e.message);
|
||||
}}
|
||||
|
||||
// Wait for page to settle
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
console.error('[*] Current URL:', page.url());
|
||||
|
||||
// Wait for Config object to be available
|
||||
await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});
|
||||
|
||||
// Call Config.getAll() - the extension's own API (returns a Promise)
|
||||
const cfg = await page.evaluate(async () => await Config.getAll());
|
||||
console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));
|
||||
|
||||
await page.close();
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(cfg));
|
||||
}})();
|
||||
'''
|
||||
(tmpdir / 'v.js').write_text(script)
|
||||
r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
|
||||
print(r.stderr)
|
||||
assert r.returncode == 0, f"Verify failed: {r.stderr}"
|
||||
|
||||
cfg = json.loads(r.stdout.strip().split('\n')[-1])
|
||||
print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")
|
||||
|
||||
# Verify all the fields we care about
|
||||
assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
|
||||
assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
|
||||
assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
|
||||
assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
|
||||
assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
|
||||
assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
|
||||
assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
|
||||
assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"
|
||||
|
||||
print(f"[+] Config verified via Config.getAll()!")
|
||||
finally:
|
||||
kill_chrome(process, chrome_dir)
|
||||
|
||||
def test_solves_recaptcha(self):
|
||||
"""Extension solves reCAPTCHA on demo page."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = setup_test_env(tmpdir)
|
||||
env['TWOCAPTCHA_API_KEY'] = self.api_key
|
||||
|
||||
subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
|
||||
|
||||
# Launch Chromium in crawls directory
|
||||
crawl_id = 'solve'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True)
|
||||
|
||||
script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
const page = await browser.newPage();
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
console.error('[*] Loading {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
const start = Date.now();
|
||||
const maxWait = 90000;
|
||||
|
||||
while (Date.now() - start < maxWait) {{
|
||||
const state = await page.evaluate(() => {{
|
||||
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
|
||||
const solver = document.querySelector('.captcha-solver');
|
||||
return {{
|
||||
solved: resp ? resp.value.length > 0 : false,
|
||||
state: solver?.getAttribute('data-state'),
|
||||
text: solver?.textContent?.trim() || ''
|
||||
}};
|
||||
}});
|
||||
const sec = Math.round((Date.now() - start) / 1000);
|
||||
console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
|
||||
if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
|
||||
if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
}}
|
||||
|
||||
const final = await page.evaluate(() => {{
|
||||
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
|
||||
return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
|
||||
}});
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(final));
|
||||
}})();
|
||||
'''
|
||||
(tmpdir / 's.js').write_text(script)
|
||||
print("\n[*] Solving CAPTCHA (10-60s)...")
|
||||
r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
|
||||
print(r.stderr)
|
||||
assert r.returncode == 0, f"Failed: {r.stderr}"
|
||||
|
||||
final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
|
||||
assert final.get('solved'), f"Not solved: {final}"
|
||||
print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
|
||||
finally:
|
||||
kill_chrome(process, chrome_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-xvs'])
|
||||
60
archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
Executable file
60
archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
Executable file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* uBlock Origin Extension Plugin
|
||||
*
|
||||
* Installs and configures the uBlock Origin Chrome extension for ad blocking
|
||||
* and privacy protection during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
|
||||
*
|
||||
* Priority: 03 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Blocks ads, trackers, and malware domains
|
||||
* - Reduces page load time and bandwidth usage
|
||||
* - Improves privacy during archiving
|
||||
* - Removes clutter from archived pages
|
||||
* - Uses efficient blocking with filter lists
|
||||
*/
|
||||
|
||||
// Import extension utilities
|
||||
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
|
||||
name: 'ublock',
|
||||
};
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*
|
||||
* Note: uBlock Origin works automatically with default filter lists.
|
||||
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
||||
*/
|
||||
async function main() {
|
||||
const extension = await installExtensionWithCache(EXTENSION);
|
||||
|
||||
if (extension) {
|
||||
console.log('[+] Ads and trackers will be blocked during archiving');
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] uBlock Origin extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] uBlock Origin extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
@@ -12,9 +12,17 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
@@ -157,91 +165,143 @@ def test_large_extension_size():
|
||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check ad blocking effectiveness by counting ad elements on page.
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure like:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
bin/
|
||||
node_modules/
|
||||
chrome_extensions/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
Returns dict with:
|
||||
- adElementsFound: int - number of ad-related elements found
|
||||
- adElementsVisible: int - number of visible ad elements
|
||||
- blockedRequests: int - number of blocked network requests (ads/trackers)
|
||||
- totalRequests: int - total network requests made
|
||||
- percentBlocked: int - percentage of ad elements hidden (0-100)
|
||||
"""
|
||||
import platform
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
# Create proper directory structure
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
chrome_extensions_dir = data_dir / 'chrome_extensions'
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
// Track network requests
|
||||
let blockedRequests = 0;
|
||||
let totalRequests = 0;
|
||||
const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
|
||||
'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
|
||||
'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
})
|
||||
page.on('request', request => {{
|
||||
totalRequests++;
|
||||
const url = request.url().toLowerCase();
|
||||
if (adDomains.some(d => url.includes(d))) {{
|
||||
// This is an ad request
|
||||
}}
|
||||
}});
|
||||
|
||||
page.on('requestfailed', request => {{
|
||||
const url = request.url().toLowerCase();
|
||||
if (adDomains.some(d => url.includes(d))) {{
|
||||
blockedRequests++;
|
||||
}}
|
||||
}});
|
||||
|
||||
console.error('Navigating to {test_url}...');
|
||||
await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});
|
||||
|
||||
// Wait for page to fully render and ads to load
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
|
||||
// Check for ad elements in the DOM
|
||||
const result = await page.evaluate(() => {{
|
||||
// Common ad-related selectors
|
||||
const adSelectors = [
|
||||
// Generic ad containers
|
||||
'[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
|
||||
'[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
|
||||
'[class*="advertisement"]', '[id*="advertisement"]',
|
||||
'[class*="sponsored"]', '[id*="sponsored"]',
|
||||
// Google ads
|
||||
'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
|
||||
// Yahoo specific
|
||||
'[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
|
||||
'[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
|
||||
// iframes (often ads)
|
||||
'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
|
||||
// Common ad sizes
|
||||
'[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
|
||||
'[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
|
||||
];
|
||||
|
||||
let adElementsFound = 0;
|
||||
let adElementsVisible = 0;
|
||||
|
||||
for (const selector of adSelectors) {{
|
||||
try {{
|
||||
const elements = document.querySelectorAll(selector);
|
||||
for (const el of elements) {{
|
||||
adElementsFound++;
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const isVisible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
style.opacity !== '0' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
if (isVisible) {{
|
||||
adElementsVisible++;
|
||||
}}
|
||||
}}
|
||||
}} catch (e) {{
|
||||
// Invalid selector, skip
|
||||
}}
|
||||
}}
|
||||
|
||||
return {{
|
||||
adElementsFound,
|
||||
adElementsVisible,
|
||||
pageTitle: document.title
|
||||
}};
|
||||
}});
|
||||
|
||||
result.blockedRequests = blockedRequests;
|
||||
result.totalRequests = totalRequests;
|
||||
// Calculate how many ad elements were hidden (found but not visible)
|
||||
const hiddenAds = result.adElementsFound - result.adElementsVisible;
|
||||
result.percentBlocked = result.adElementsFound > 0
|
||||
? Math.round((hiddenAds / result.adElementsFound) * 100)
|
||||
: 0;
|
||||
|
||||
console.error('Ad blocking result:', JSON.stringify(result));
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = script_dir / 'check_ads.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=10, env=env
|
||||
['node', str(script_path)],
|
||||
cwd=str(script_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
raise RuntimeError(f"Ad check script failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
if not output_lines:
|
||||
raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
# Test URL: ad blocker test page that shows if ads are blocked
|
||||
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
||||
# Test URL: Yahoo has many ads that uBlock should block
|
||||
TEST_URL = 'https://www.yahoo.com/'
|
||||
|
||||
|
||||
@pytest.mark.timeout(15)
|
||||
@@ -290,14 +350,18 @@ def test_extension_loads_in_chromium():
|
||||
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
|
||||
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
|
||||
print("[test] Launching Chromium...", flush=True)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
|
||||
# Launch Chromium in crawls directory
|
||||
crawl_id = 'test-ublock'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
|
||||
cwd=str(crawl_dir),
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -457,161 +521,177 @@ const puppeteer = require('puppeteer-core');
|
||||
def test_blocks_ads_on_test_page():
|
||||
"""Live test: verify uBlock Origin blocks ads on a test page.
|
||||
|
||||
Uses Chromium with extensions loaded automatically via chrome hook.
|
||||
Tests against d3ward's ad blocker test page which checks ad domains.
|
||||
This test runs TWO browser sessions:
|
||||
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
|
||||
2. WITH extension - verifies ads ARE blocked
|
||||
|
||||
This ensures we're actually testing the extension's effect, not just
|
||||
that a test page happens to show ads as blocked.
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env_base = setup_test_env(tmpdir)
|
||||
env_base['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
# ============================================================
|
||||
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 1: BASELINE TEST (no extension)")
|
||||
print("="*60)
|
||||
|
||||
data_dir = Path(env_base['DATA_DIR'])
|
||||
|
||||
env_no_ext = env_base.copy()
|
||||
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
|
||||
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Launch baseline Chromium in crawls directory
|
||||
baseline_crawl_id = 'baseline-no-ext'
|
||||
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
|
||||
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
|
||||
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
|
||||
baseline_process = None
|
||||
|
||||
try:
|
||||
baseline_process, baseline_cdp_url = launch_chromium_session(
|
||||
env_no_ext, baseline_chrome_dir, baseline_crawl_id
|
||||
)
|
||||
print(f"Baseline Chromium launched: {baseline_cdp_url}")
|
||||
|
||||
# Wait a moment for browser to be ready
|
||||
time.sleep(2)
|
||||
|
||||
baseline_result = check_ad_blocking(
|
||||
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
|
||||
f"(found {baseline_result['adElementsFound']} ad elements)")
|
||||
|
||||
finally:
|
||||
if baseline_process:
|
||||
kill_chromium_session(baseline_process, baseline_chrome_dir)
|
||||
|
||||
# Verify baseline shows ads ARE visible (not blocked)
|
||||
if baseline_result['adElementsFound'] == 0:
|
||||
pytest.skip(
|
||||
f"Cannot test extension: no ad elements found on {TEST_URL}. "
|
||||
f"The page may have changed or loaded differently."
|
||||
)
|
||||
|
||||
if baseline_result['adElementsVisible'] == 0:
|
||||
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
|
||||
print("This suggests either:")
|
||||
print(" - There's another ad blocker interfering")
|
||||
print(" - Network-level ad blocking is in effect")
|
||||
|
||||
pytest.skip(
|
||||
f"Cannot test extension: baseline shows no visible ads "
|
||||
f"despite finding {baseline_result['adElementsFound']} ad elements."
|
||||
)
|
||||
|
||||
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
|
||||
|
||||
# ============================================================
|
||||
# STEP 2: Install the uBlock extension
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 2: INSTALLING EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the uBlock extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=15
|
||||
env=env_base,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'ublock.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
# ============================================================
|
||||
# STEP 3: Run WITH extension, verify ads ARE blocked
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 3: TEST WITH EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chrome CDP URL not found after 20s"
|
||||
print(f"Chrome launched with CDP URL: {cdp_url}")
|
||||
|
||||
# Check that extensions were loaded
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
# Launch extension test Chromium in crawls directory
|
||||
ext_crawl_id = 'test-with-ext'
|
||||
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
|
||||
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
ext_chrome_dir = ext_crawl_dir / 'chrome'
|
||||
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
|
||||
ext_process = None
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chrome and test ad blocking
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
ext_process, ext_cdp_url = launch_chromium_session(
|
||||
env_base, ext_chrome_dir, ext_crawl_id
|
||||
)
|
||||
print(f"Extension Chromium launched: {ext_cdp_url}")
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
# Check that extension was loaded
|
||||
extensions_file = ext_chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
# Wait for extension to initialize
|
||||
time.sleep(3)
|
||||
|
||||
// Check extension loaded by looking at targets
|
||||
const targets = browser.targets();
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
console.error('Extension targets found:', extTargets.length);
|
||||
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
console.error('Navigating to {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
|
||||
|
||||
// Wait for the test page to run its checks
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
|
||||
// The d3ward test page shows blocked percentage
|
||||
const result = await page.evaluate(() => {{
|
||||
const scoreEl = document.querySelector('#score');
|
||||
const score = scoreEl ? scoreEl.textContent : null;
|
||||
const blockedItems = document.querySelectorAll('.blocked').length;
|
||||
const totalItems = document.querySelectorAll('.testlist li').length;
|
||||
return {{
|
||||
score,
|
||||
blockedItems,
|
||||
totalItems,
|
||||
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
|
||||
}};
|
||||
}});
|
||||
|
||||
console.error('Ad blocking result:', JSON.stringify(result));
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_ublock.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10
|
||||
ext_result = check_ad_blocking(
|
||||
ext_cdp_url, TEST_URL, env_base, tmpdir
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
|
||||
# uBlock should block most ad domains on the test page
|
||||
assert test_result['percentBlocked'] >= 50, \
|
||||
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
|
||||
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
|
||||
f"(found {ext_result['adElementsFound']} ad elements)")
|
||||
|
||||
finally:
|
||||
# Clean up Chrome
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
if ext_process:
|
||||
kill_chromium_session(ext_process, ext_chrome_dir)
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: Compare results
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 4: COMPARISON")
|
||||
print("="*60)
|
||||
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
|
||||
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
|
||||
|
||||
# Calculate reduction in visible ads
|
||||
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
|
||||
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
|
||||
|
||||
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
|
||||
|
||||
# Extension should significantly reduce visible ads
|
||||
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
|
||||
f"uBlock should reduce visible ads.\n" \
|
||||
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
|
||||
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
|
||||
f"Expected fewer ads with extension."
|
||||
|
||||
# Extension should block at least 30% of ads
|
||||
assert reduction_percent >= 30, \
|
||||
f"uBlock should block at least 30% of ads.\n" \
|
||||
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
|
||||
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
|
||||
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
|
||||
|
||||
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
|
||||
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
|
||||
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
|
||||
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")
|
||||
|
||||
130
archivebox/plugins/wget/on_Crawl__10_install_wget.py
Normal file
130
archivebox/plugins/wget/on_Crawl__10_install_wget.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived wget config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Validate config values with warnings (not hard errors)
|
||||
2. Compute derived values (USE_WGET from WGET_ENABLED)
|
||||
3. Check binary availability and version
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- Binary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Read config from environment (already validated by JSONSchema)
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def output_binary(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record to stdout."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env',
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
warnings = []
|
||||
errors = []
|
||||
computed = {}
|
||||
|
||||
# Get config values
|
||||
wget_enabled = get_env_bool('WGET_ENABLED', True)
|
||||
wget_save_warc = get_env_bool('WGET_SAVE_WARC', True)
|
||||
wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
|
||||
wget_binary = get_env('WGET_BINARY', 'wget')
|
||||
|
||||
# Compute derived values (USE_WGET for backward compatibility)
|
||||
use_wget = wget_enabled
|
||||
computed['USE_WGET'] = str(use_wget).lower()
|
||||
|
||||
# Validate timeout with warning (not error)
|
||||
if use_wget and wget_timeout < 20:
|
||||
warnings.append(
|
||||
f"WGET_TIMEOUT={wget_timeout} is very low. "
|
||||
"wget may fail to archive sites if set to less than ~20 seconds. "
|
||||
"Consider setting WGET_TIMEOUT=60 or higher."
|
||||
)
|
||||
|
||||
# Check binary availability using abx-pkg
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=wget_binary, binproviders=[provider]).load()
|
||||
binary_path = str(binary.abspath) if binary.abspath else ''
|
||||
except Exception:
|
||||
binary = None
|
||||
binary_path = ''
|
||||
|
||||
if not binary_path:
|
||||
if use_wget:
|
||||
errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
|
||||
computed['WGET_BINARY'] = ''
|
||||
else:
|
||||
computed['WGET_BINARY'] = binary_path
|
||||
wget_version = str(binary.version) if binary.version else 'unknown'
|
||||
computed['WGET_VERSION'] = wget_version
|
||||
|
||||
# Output Binary JSONL record
|
||||
output_binary(binary, name='wget')
|
||||
|
||||
# Check for compression support
|
||||
if computed.get('WGET_BINARY'):
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[computed['WGET_BINARY'], '--compression=auto', '--help'],
|
||||
capture_output=True, timeout=5
|
||||
)
|
||||
computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
|
||||
except Exception:
|
||||
computed['WGET_AUTO_COMPRESSION'] = 'false'
|
||||
|
||||
# Output results
|
||||
# Format: KEY=VALUE lines that hooks.py will parse and add to env
|
||||
for key, value in computed.items():
|
||||
print(f"COMPUTED:{key}={value}")
|
||||
|
||||
for warning in warnings:
|
||||
print(f"WARNING:{warning}", file=sys.stderr)
|
||||
|
||||
for error in errors:
|
||||
print(f"ERROR:{error}", file=sys.stderr)
|
||||
|
||||
# Exit with error if any hard errors
|
||||
sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
218
archivebox/tests/conftest.py
Normal file
218
archivebox/tests/conftest.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Fixtures
|
||||
# =============================================================================
|
||||
|
||||
@pytest.fixture
|
||||
def isolated_data_dir(tmp_path, settings):
|
||||
"""
|
||||
Create isolated DATA_DIR for each test.
|
||||
|
||||
Uses tmp_path for isolation, configures Django settings.
|
||||
"""
|
||||
data_dir = tmp_path / 'archivebox_data'
|
||||
data_dir.mkdir()
|
||||
|
||||
# Set environment for subprocess calls
|
||||
os.environ['DATA_DIR'] = str(data_dir)
|
||||
|
||||
# Update Django settings
|
||||
settings.DATA_DIR = data_dir
|
||||
|
||||
yield data_dir
|
||||
|
||||
# Cleanup handled by tmp_path fixture
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def initialized_archive(isolated_data_dir):
|
||||
"""
|
||||
Initialize ArchiveBox archive in isolated directory.
|
||||
|
||||
Runs `archivebox init` to set up database and directories.
|
||||
"""
|
||||
from archivebox.cli.archivebox_init import init
|
||||
init(setup=True, quick=True)
|
||||
return isolated_data_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def cli_env(initialized_archive):
|
||||
"""
|
||||
Environment dict for CLI subprocess calls.
|
||||
|
||||
Includes DATA_DIR and disables slow extractors.
|
||||
"""
|
||||
return {
|
||||
**os.environ,
|
||||
'DATA_DIR': str(initialized_archive),
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'SAVE_TITLE': 'True',
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Helpers
|
||||
# =============================================================================
|
||||
|
||||
def run_archivebox_cmd(
|
||||
args: List[str],
|
||||
stdin: Optional[str] = None,
|
||||
cwd: Optional[Path] = None,
|
||||
env: Optional[Dict[str, str]] = None,
|
||||
timeout: int = 60,
|
||||
) -> Tuple[str, str, int]:
|
||||
"""
|
||||
Run archivebox command, return (stdout, stderr, returncode).
|
||||
|
||||
Args:
|
||||
args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
|
||||
stdin: Optional string to pipe to stdin
|
||||
cwd: Working directory (defaults to DATA_DIR from env)
|
||||
env: Environment variables (defaults to os.environ with DATA_DIR)
|
||||
timeout: Command timeout in seconds
|
||||
|
||||
Returns:
|
||||
Tuple of (stdout, stderr, returncode)
|
||||
"""
|
||||
cmd = [sys.executable, '-m', 'archivebox'] + args
|
||||
|
||||
env = env or {**os.environ}
|
||||
cwd = cwd or Path(env.get('DATA_DIR', '.'))
|
||||
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
input=stdin,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
cwd=cwd,
|
||||
env=env,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
return result.stdout, result.stderr, result.returncode
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Output Assertions
|
||||
# =============================================================================
|
||||
|
||||
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
|
||||
"""Parse JSONL output into list of dicts."""
|
||||
records = []
|
||||
for line in stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line and line.startswith('{'):
|
||||
try:
|
||||
records.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return records
|
||||
|
||||
|
||||
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
|
||||
"""Assert output contains at least min_count records of type."""
|
||||
records = parse_jsonl_output(stdout)
|
||||
matching = [r for r in records if r.get('type') == record_type]
|
||||
assert len(matching) >= min_count, \
|
||||
f"Expected >= {min_count} {record_type}, got {len(matching)}"
|
||||
return matching
|
||||
|
||||
|
||||
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
|
||||
"""Assert that input records appear in output (pass-through behavior)."""
|
||||
output_records = parse_jsonl_output(stdout)
|
||||
output_ids = {r.get('id') for r in output_records if r.get('id')}
|
||||
|
||||
for input_rec in input_records:
|
||||
input_id = input_rec.get('id')
|
||||
if input_id:
|
||||
assert input_id in output_ids, \
|
||||
f"Input record {input_id} not found in output (pass-through failed)"
|
||||
|
||||
|
||||
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
|
||||
"""Assert record has all required fields with non-None values."""
|
||||
for field in required_fields:
|
||||
assert field in record, f"Record missing field: {field}"
|
||||
assert record[field] is not None, f"Record field is None: {field}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Database Assertions
|
||||
# =============================================================================
|
||||
|
||||
def assert_db_count(model_class, filters: Dict[str, Any], expected: int):
|
||||
"""Assert database count matches expected."""
|
||||
actual = model_class.objects.filter(**filters).count()
|
||||
assert actual == expected, \
|
||||
f"Expected {expected} {model_class.__name__}, got {actual}"
|
||||
|
||||
|
||||
def assert_db_exists(model_class, **filters):
|
||||
"""Assert at least one record exists matching filters."""
|
||||
assert model_class.objects.filter(**filters).exists(), \
|
||||
f"No {model_class.__name__} found matching {filters}"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Data Factories
|
||||
# =============================================================================
|
||||
|
||||
def create_test_url(domain: str = 'example.com', path: str = None) -> str:
|
||||
"""Generate unique test URL."""
|
||||
import uuid
|
||||
path = path or uuid.uuid4().hex[:8]
|
||||
return f'https://{domain}/{path}'
|
||||
|
||||
|
||||
def create_test_crawl_json(urls: List[str] = None, **kwargs) -> Dict[str, Any]:
|
||||
"""Create Crawl JSONL record for testing."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
urls = urls or [create_test_url()]
|
||||
return {
|
||||
'type': TYPE_CRAWL,
|
||||
'urls': '\n'.join(urls),
|
||||
'max_depth': kwargs.get('max_depth', 0),
|
||||
'tags_str': kwargs.get('tags_str', ''),
|
||||
'status': kwargs.get('status', 'queued'),
|
||||
**{k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')},
|
||||
}
|
||||
|
||||
|
||||
def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]:
|
||||
"""Create Snapshot JSONL record for testing."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
return {
|
||||
'type': TYPE_SNAPSHOT,
|
||||
'url': url or create_test_url(),
|
||||
'tags_str': kwargs.get('tags_str', ''),
|
||||
'status': kwargs.get('status', 'queued'),
|
||||
**{k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')},
|
||||
}
|
||||
264
archivebox/tests/test_cli_archiveresult.py
Normal file
264
archivebox/tests/test_cli_archiveresult.py
Normal file
@@ -0,0 +1,264 @@
|
||||
"""
|
||||
Tests for archivebox archiveresult CLI command.
|
||||
|
||||
Tests cover:
|
||||
- archiveresult create (from Snapshot JSONL, with --plugin, pass-through)
|
||||
- archiveresult list (with filters)
|
||||
- archiveresult update
|
||||
- archiveresult delete
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
create_test_url,
|
||||
)
|
||||
|
||||
|
||||
class TestArchiveResultCreate:
|
||||
"""Tests for `archivebox archiveresult create`."""
|
||||
|
||||
def test_create_from_snapshot_jsonl(self, cli_env, initialized_archive):
|
||||
"""Create archive results from Snapshot JSONL input."""
|
||||
url = create_test_url()
|
||||
|
||||
# Create a snapshot first
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Pipe snapshot to archiveresult create
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
# Should have the Snapshot passed through and ArchiveResult created
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Snapshot' in types
|
||||
assert 'ArchiveResult' in types
|
||||
|
||||
ar = next(r for r in records if r['type'] == 'ArchiveResult')
|
||||
assert ar['plugin'] == 'title'
|
||||
|
||||
def test_create_with_specific_plugin(self, cli_env, initialized_archive):
|
||||
"""Create archive result for specific plugin."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=screenshot'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||
assert len(ar_records) >= 1
|
||||
assert ar_records[0]['plugin'] == 'screenshot'
|
||||
|
||||
def test_create_pass_through_crawl(self, cli_env, initialized_archive):
|
||||
"""Pass-through Crawl records unchanged."""
|
||||
url = create_test_url()
|
||||
|
||||
# Create crawl and snapshot
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
# Now pipe all to archiveresult create
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=stdout2,
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout3)
|
||||
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Crawl' in types
|
||||
assert 'Snapshot' in types
|
||||
assert 'ArchiveResult' in types
|
||||
|
||||
def test_create_pass_through_only_when_no_snapshots(self, cli_env, initialized_archive):
|
||||
"""Only pass-through records but no new snapshots returns success."""
|
||||
crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create'],
|
||||
stdin=json.dumps(crawl_record),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Passed through' in stderr
|
||||
|
||||
|
||||
class TestArchiveResultList:
|
||||
"""Tests for `archivebox archiveresult list`."""
|
||||
|
||||
def test_list_empty(self, cli_env, initialized_archive):
|
||||
"""List with no archive results returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 archive results' in stderr
|
||||
|
||||
def test_list_filter_by_status(self, cli_env, initialized_archive):
|
||||
"""Filter archive results by status."""
|
||||
# Create snapshot and archive result
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--status=queued'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
|
||||
def test_list_filter_by_plugin(self, cli_env, initialized_archive):
|
||||
"""Filter archive results by plugin."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=title'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['plugin'] == 'title'
|
||||
|
||||
def test_list_with_limit(self, cli_env, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
# Create multiple archive results
|
||||
for _ in range(3):
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--limit=2'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 2
|
||||
|
||||
|
||||
class TestArchiveResultUpdate:
|
||||
"""Tests for `archivebox archiveresult update`."""
|
||||
|
||||
def test_update_status(self, cli_env, initialized_archive):
|
||||
"""Update archive result status."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=failed'],
|
||||
stdin=json.dumps(ar),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 archive results' in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout3)
|
||||
assert records[0]['status'] == 'failed'
|
||||
|
||||
|
||||
class TestArchiveResultDelete:
|
||||
"""Tests for `archivebox archiveresult delete`."""
|
||||
|
||||
def test_delete_requires_yes(self, cli_env, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete'],
|
||||
stdin=json.dumps(ar),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
|
||||
def test_delete_with_yes(self, cli_env, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete', '--yes'],
|
||||
stdin=json.dumps(ar),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 archive results' in stderr
|
||||
261
archivebox/tests/test_cli_crawl.py
Normal file
261
archivebox/tests/test_cli_crawl.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Tests for archivebox crawl CLI command.
|
||||
|
||||
Tests cover:
|
||||
- crawl create (with URLs, from stdin, pass-through)
|
||||
- crawl list (with filters)
|
||||
- crawl update
|
||||
- crawl delete
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
create_test_crawl_json,
|
||||
)
|
||||
|
||||
|
||||
class TestCrawlCreate:
|
||||
"""Tests for `archivebox crawl create`."""
|
||||
|
||||
def test_create_from_url_args(self, cli_env, initialized_archive):
|
||||
"""Create crawl from URL arguments."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', url],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
assert 'Created crawl' in stderr
|
||||
|
||||
# Check JSONL output
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert records[0]['type'] == 'Crawl'
|
||||
assert url in records[0]['urls']
|
||||
|
||||
def test_create_from_stdin_urls(self, cli_env, initialized_archive):
|
||||
"""Create crawl from stdin URLs (one per line)."""
|
||||
urls = [create_test_url() for _ in range(3)]
|
||||
stdin = '\n'.join(urls)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
stdin=stdin,
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
crawl = records[0]
|
||||
assert crawl['type'] == 'Crawl'
|
||||
# All URLs should be in the crawl
|
||||
for url in urls:
|
||||
assert url in crawl['urls']
|
||||
|
||||
def test_create_with_depth(self, cli_env, initialized_archive):
|
||||
"""Create crawl with --depth flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', '--depth=2', url],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert records[0]['max_depth'] == 2
|
||||
|
||||
def test_create_with_tag(self, cli_env, initialized_archive):
|
||||
"""Create crawl with --tag flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', '--tag=test-tag', url],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags_str', '')
|
||||
|
||||
def test_create_pass_through_other_types(self, cli_env, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
|
||||
url = create_test_url()
|
||||
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
stdin=stdin,
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
# Should have both the passed-through Tag and the new Crawl
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Tag' in types
|
||||
assert 'Crawl' in types
|
||||
|
||||
def test_create_pass_through_existing_crawl(self, cli_env, initialized_archive):
|
||||
"""Existing Crawl records (with id) are passed through."""
|
||||
# First create a crawl
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Now pipe it back - should pass through
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert len(records) == 1
|
||||
assert records[0]['id'] == crawl['id']
|
||||
|
||||
|
||||
class TestCrawlList:
|
||||
"""Tests for `archivebox crawl list`."""
|
||||
|
||||
def test_list_empty(self, cli_env, initialized_archive):
|
||||
"""List with no crawls returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 crawls' in stderr
|
||||
|
||||
def test_list_returns_created(self, cli_env, initialized_archive):
|
||||
"""List returns previously created crawls."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) >= 1
|
||||
assert any(url in r.get('urls', '') for r in records)
|
||||
|
||||
def test_list_filter_by_status(self, cli_env, initialized_archive):
|
||||
"""Filter crawls by status."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list', '--status=queued'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
|
||||
def test_list_with_limit(self, cli_env, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
# Create multiple crawls
|
||||
for _ in range(3):
|
||||
run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list', '--limit=2'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 2
|
||||
|
||||
|
||||
class TestCrawlUpdate:
|
||||
"""Tests for `archivebox crawl update`."""
|
||||
|
||||
def test_update_status(self, cli_env, initialized_archive):
|
||||
"""Update crawl status."""
|
||||
# Create a crawl
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Update it
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'update', '--status=started'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 crawls' in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert records[0]['status'] == 'started'
|
||||
|
||||
|
||||
class TestCrawlDelete:
|
||||
"""Tests for `archivebox crawl delete`."""
|
||||
|
||||
def test_delete_requires_yes(self, cli_env, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
|
||||
def test_delete_with_yes(self, cli_env, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete', '--yes'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 crawls' in stderr
|
||||
|
||||
def test_delete_dry_run(self, cli_env, initialized_archive):
|
||||
"""Dry run shows what would be deleted."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete', '--dry-run'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Would delete' in stderr
|
||||
assert 'dry run' in stderr.lower()
|
||||
254
archivebox/tests/test_cli_run.py
Normal file
254
archivebox/tests/test_cli_run.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Tests for archivebox run CLI command.
|
||||
|
||||
Tests cover:
|
||||
- run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
|
||||
- create-or-update behavior (records with/without id)
|
||||
- pass-through output (for chaining)
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
create_test_url,
|
||||
create_test_crawl_json,
|
||||
create_test_snapshot_json,
|
||||
)
|
||||
|
||||
|
||||
class TestRunWithCrawl:
|
||||
"""Tests for `archivebox run` with Crawl input."""
|
||||
|
||||
def test_run_with_new_crawl(self, cli_env, initialized_archive):
|
||||
"""Run creates and processes a new Crawl (no id)."""
|
||||
crawl_record = create_test_crawl_json()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(crawl_record),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
# Should output the created Crawl
|
||||
records = parse_jsonl_output(stdout)
|
||||
crawl_records = [r for r in records if r.get('type') == 'Crawl']
|
||||
assert len(crawl_records) >= 1
|
||||
assert crawl_records[0].get('id') # Should have an id now
|
||||
|
||||
def test_run_with_existing_crawl(self, cli_env, initialized_archive):
|
||||
"""Run re-queues an existing Crawl (with id)."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a crawl
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Run with the existing crawl
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert len(records) >= 1
|
||||
|
||||
|
||||
class TestRunWithSnapshot:
|
||||
"""Tests for `archivebox run` with Snapshot input."""
|
||||
|
||||
def test_run_with_new_snapshot(self, cli_env, initialized_archive):
|
||||
"""Run creates and processes a new Snapshot (no id, just url)."""
|
||||
snapshot_record = create_test_snapshot_json()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(snapshot_record),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout)
|
||||
snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
|
||||
assert len(snapshot_records) >= 1
|
||||
assert snapshot_records[0].get('id')
|
||||
|
||||
def test_run_with_existing_snapshot(self, cli_env, initialized_archive):
|
||||
"""Run re-queues an existing Snapshot (with id)."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a snapshot
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Run with the existing snapshot
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert len(records) >= 1
|
||||
|
||||
def test_run_with_plain_url(self, cli_env, initialized_archive):
|
||||
"""Run accepts plain URL records (no type field)."""
|
||||
url = create_test_url()
|
||||
url_record = {'url': url}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(url_record),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) >= 1
|
||||
|
||||
|
||||
class TestRunWithArchiveResult:
|
||||
"""Tests for `archivebox run` with ArchiveResult input."""
|
||||
|
||||
def test_run_requeues_failed_archiveresult(self, cli_env, initialized_archive):
|
||||
"""Run re-queues a failed ArchiveResult."""
|
||||
url = create_test_url()
|
||||
|
||||
# Create snapshot and archive result
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
|
||||
# Update to failed
|
||||
ar['status'] = 'failed'
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=failed'],
|
||||
stdin=json.dumps(ar),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
# Now run should re-queue it
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(ar),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout3)
|
||||
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||
assert len(ar_records) >= 1
|
||||
|
||||
|
||||
class TestRunPassThrough:
|
||||
"""Tests for pass-through behavior in `archivebox run`."""
|
||||
|
||||
def test_run_passes_through_unknown_types(self, cli_env, initialized_archive):
|
||||
"""Run passes through records with unknown types."""
|
||||
unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(unknown_record),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
unknown_records = [r for r in records if r.get('type') == 'Unknown']
|
||||
assert len(unknown_records) == 1
|
||||
assert unknown_records[0]['data'] == 'test'
|
||||
|
||||
def test_run_outputs_all_processed_records(self, cli_env, initialized_archive):
|
||||
"""Run outputs all processed records for chaining."""
|
||||
url = create_test_url()
|
||||
crawl_record = create_test_crawl_json(urls=[url])
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(crawl_record),
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
# Should have at least the Crawl in output
|
||||
assert len(records) >= 1
|
||||
|
||||
|
||||
class TestRunMixedInput:
|
||||
"""Tests for `archivebox run` with mixed record types."""
|
||||
|
||||
def test_run_handles_mixed_types(self, cli_env, initialized_archive):
|
||||
"""Run handles mixed Crawl/Snapshot/ArchiveResult input."""
|
||||
crawl = create_test_crawl_json()
|
||||
snapshot = create_test_snapshot_json()
|
||||
unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
|
||||
|
||||
stdin = '\n'.join([
|
||||
json.dumps(crawl),
|
||||
json.dumps(snapshot),
|
||||
json.dumps(unknown),
|
||||
])
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdin,
|
||||
env=cli_env,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
types = set(r.get('type') for r in records)
|
||||
# Should have processed Crawl and Snapshot, passed through Tag
|
||||
assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
|
||||
|
||||
|
||||
class TestRunEmpty:
|
||||
"""Tests for `archivebox run` edge cases."""
|
||||
|
||||
def test_run_empty_stdin(self, cli_env, initialized_archive):
|
||||
"""Run with empty stdin returns success."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin='',
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
|
||||
def test_run_no_records_to_process(self, cli_env, initialized_archive):
|
||||
"""Run with only pass-through records shows message."""
|
||||
unknown = {'type': 'Unknown', 'id': 'fake'}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(unknown),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'No records to process' in stderr
|
||||
274
archivebox/tests/test_cli_snapshot.py
Normal file
274
archivebox/tests/test_cli_snapshot.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Tests for archivebox snapshot CLI command.
|
||||
|
||||
Tests cover:
|
||||
- snapshot create (from URLs, from Crawl JSONL, pass-through)
|
||||
- snapshot list (with filters)
|
||||
- snapshot update
|
||||
- snapshot delete
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
)
|
||||
|
||||
|
||||
class TestSnapshotCreate:
|
||||
"""Tests for `archivebox snapshot create`."""
|
||||
|
||||
def test_create_from_url_args(self, cli_env, initialized_archive):
|
||||
"""Create snapshot from URL arguments."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create', url],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
assert 'Created' in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert records[0]['type'] == 'Snapshot'
|
||||
assert records[0]['url'] == url
|
||||
|
||||
def test_create_from_crawl_jsonl(self, cli_env, initialized_archive):
|
||||
"""Create snapshots from Crawl JSONL input."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a crawl
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Pipe crawl to snapshot create
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
stdin=json.dumps(crawl),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
# Should have the Crawl passed through and the Snapshot created
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Crawl' in types
|
||||
assert 'Snapshot' in types
|
||||
|
||||
snapshot = next(r for r in records if r['type'] == 'Snapshot')
|
||||
assert snapshot['url'] == url
|
||||
|
||||
def test_create_with_tag(self, cli_env, initialized_archive):
|
||||
"""Create snapshot with --tag flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create', '--tag=test-tag', url],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags_str', '')
|
||||
|
||||
def test_create_pass_through_other_types(self, cli_env, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
|
||||
url = create_test_url()
|
||||
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
stdin=stdin,
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Tag' in types
|
||||
assert 'Snapshot' in types
|
||||
|
||||
def test_create_multiple_urls(self, cli_env, initialized_archive):
|
||||
"""Create snapshots from multiple URLs."""
|
||||
urls = [create_test_url() for _ in range(3)]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'] + urls,
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 3
|
||||
|
||||
created_urls = {r['url'] for r in records}
|
||||
for url in urls:
|
||||
assert url in created_urls
|
||||
|
||||
|
||||
class TestSnapshotList:
|
||||
"""Tests for `archivebox snapshot list`."""
|
||||
|
||||
def test_list_empty(self, cli_env, initialized_archive):
|
||||
"""List with no snapshots returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 snapshots' in stderr
|
||||
|
||||
def test_list_returns_created(self, cli_env, initialized_archive):
|
||||
"""List returns previously created snapshots."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) >= 1
|
||||
assert any(r.get('url') == url for r in records)
|
||||
|
||||
def test_list_filter_by_status(self, cli_env, initialized_archive):
|
||||
"""Filter snapshots by status."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--status=queued'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
|
||||
def test_list_filter_by_url_contains(self, cli_env, initialized_archive):
|
||||
"""Filter snapshots by URL contains."""
|
||||
url = create_test_url(domain='unique-domain-12345.com')
|
||||
run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--url__icontains=unique-domain-12345'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert 'unique-domain-12345' in records[0]['url']
|
||||
|
||||
def test_list_with_limit(self, cli_env, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
for _ in range(3):
|
||||
run_archivebox_cmd(['snapshot', 'create', create_test_url()], env=cli_env)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--limit=2'],
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 2
|
||||
|
||||
|
||||
class TestSnapshotUpdate:
|
||||
"""Tests for `archivebox snapshot update`."""
|
||||
|
||||
def test_update_status(self, cli_env, initialized_archive):
|
||||
"""Update snapshot status."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'update', '--status=started'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 snapshots' in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert records[0]['status'] == 'started'
|
||||
|
||||
def test_update_add_tag(self, cli_env, initialized_archive):
|
||||
"""Update snapshot by adding tag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'update', '--tag=new-tag'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 snapshots' in stderr
|
||||
|
||||
|
||||
class TestSnapshotDelete:
|
||||
"""Tests for `archivebox snapshot delete`."""
|
||||
|
||||
def test_delete_requires_yes(self, cli_env, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
|
||||
def test_delete_with_yes(self, cli_env, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete', '--yes'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 snapshots' in stderr
|
||||
|
||||
def test_delete_dry_run(self, cli_env, initialized_archive):
|
||||
"""Dry run shows what would be deleted."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete', '--dry-run'],
|
||||
stdin=json.dumps(snapshot),
|
||||
env=cli_env,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Would delete' in stderr
|
||||
@@ -32,7 +32,7 @@ _supervisord_proc = None
|
||||
|
||||
ORCHESTRATOR_WORKER = {
|
||||
"name": "worker_orchestrator",
|
||||
"command": "archivebox manage orchestrator", # runs forever by default
|
||||
"command": "archivebox run", # runs forever by default
|
||||
"autostart": "true",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_orchestrator.log",
|
||||
|
||||
Reference in New Issue
Block a user