Merge branch 'dev' into claude/refactor-process-management-WcQyZ

This commit is contained in:
Nick Sweeting
2025-12-30 23:42:23 -08:00
committed by GitHub
53 changed files with 4673 additions and 2494 deletions

131
TODO_cli_refactor.md Normal file
View File

@@ -0,0 +1,131 @@
# ArchiveBox CLI Refactor TODO
## Design Decisions
1. **Keep `archivebox add`** as high-level convenience command
2. **Unified `archivebox run`** for processing (replaces per-model `run` and `orchestrator`)
3. **Expose all models** including binary, process, machine
4. **Clean break** from old command structure (no backward compatibility aliases)
## Final Architecture
```
archivebox <model> <action> [args...] [--filters]
archivebox run [stdin JSONL]
```
### Actions (4 per model; the read-only `process` and `machine` models expose `list` only):
- `create` - Create records (from args, stdin, or JSONL), dedupes by indexed fields
- `list` - Query records (with filters, returns JSONL)
- `update` - Modify records (from stdin JSONL, PATCH semantics)
- `delete` - Remove records (from stdin JSONL, requires --yes)
### Unified Run Command:
- `archivebox run` - Process queued work
- With stdin JSONL: Process piped records, exit when complete
- Without stdin (TTY): Run orchestrator in foreground until killed
### Models (7 total):
- `crawl` - Crawl jobs
- `snapshot` - Individual archived pages
- `archiveresult` - Plugin extraction results
- `tag` - Tags/labels
- `binary` - Detected binaries (chrome, wget, etc.)
- `process` - Process execution records (read-only)
- `machine` - Machine/host records (read-only)
---
## Implementation Checklist
### Phase 1: Unified Run Command
- [x] Create `archivebox/cli/archivebox_run.py` - unified processing command
### Phase 2: Core Model Commands
- [x] Refactor `archivebox/cli/archivebox_snapshot.py` to Click group with create|list|update|delete
- [x] Refactor `archivebox/cli/archivebox_crawl.py` to Click group with create|list|update|delete
- [x] Create `archivebox/cli/archivebox_archiveresult.py` with create|list|update|delete
- [x] Create `archivebox/cli/archivebox_tag.py` with create|list|update|delete
### Phase 3: System Model Commands
- [x] Create `archivebox/cli/archivebox_binary.py` with create|list|update|delete
- [x] Create `archivebox/cli/archivebox_process.py` with list only (read-only)
- [x] Create `archivebox/cli/archivebox_machine.py` with list only (read-only)
### Phase 4: Registry & Cleanup
- [x] Update `archivebox/cli/__init__.py` command registry
- [x] Delete `archivebox/cli/archivebox_extract.py`
- [x] Delete `archivebox/cli/archivebox_remove.py`
- [x] Delete `archivebox/cli/archivebox_search.py`
- [x] Delete `archivebox/cli/archivebox_orchestrator.py`
- [x] Update `archivebox/cli/archivebox_add.py` internals (no changes needed - uses models directly)
- [x] Update `archivebox/cli/tests_piping.py`
### Phase 5: Tests for New Commands
- [ ] Add tests for `archivebox run` command
- [ ] Add tests for `archivebox crawl create|list|update|delete`
- [ ] Add tests for `archivebox snapshot create|list|update|delete`
- [ ] Add tests for `archivebox archiveresult create|list|update|delete`
- [ ] Add tests for `archivebox tag create|list|update|delete`
- [ ] Add tests for `archivebox binary create|list|update|delete`
- [ ] Add tests for `archivebox process list`
- [ ] Add tests for `archivebox machine list`
---
## Usage Examples
### Basic CRUD
```bash
# Create
archivebox crawl create https://example.com https://foo.com --depth=1
archivebox snapshot create https://example.com --tag=news
# List with filters
archivebox crawl list --status=queued
archivebox snapshot list --url__icontains=example.com
archivebox archiveresult list --status=failed --plugin=screenshot
# Update (reads JSONL from stdin, applies changes)
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
# Delete (requires --yes)
archivebox crawl list --urls__icontains=example.com | archivebox crawl delete --yes
```
### Unified Run Command
```bash
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
archivebox run
# Process specific records (pipe any JSONL type, exits when done)
archivebox snapshot list --status=queued | archivebox run
archivebox archiveresult list --status=failed | archivebox run
archivebox crawl list --status=queued | archivebox run
# Mixed types work too - run handles any JSONL
cat mixed_records.jsonl | archivebox run
```
### Composed Workflows
```bash
# Full pipeline (the explicit equivalent of what `archivebox add` does internally)
archivebox crawl create https://example.com --status=queued \
| archivebox snapshot create --status=queued \
| archivebox archiveresult create --status=queued \
| archivebox run
# Re-run failed extractions
archivebox archiveresult list --status=failed | archivebox run
# Delete all snapshots for a domain
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
```
### Keep `archivebox add` as convenience
```bash
# This remains the simple user-friendly interface:
archivebox add https://example.com --depth=1 --tag=news
# Internally equivalent to the composed pipeline above
```

View File

@@ -478,7 +478,7 @@ interface LoadedChromeExtension extends ChromeExtension {
const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
// Content access / unblocking / blocking plugins
{webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
{webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'twocaptcha'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
{webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
{webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
// {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
@@ -1123,7 +1123,7 @@ async function setup2CaptchaExtension({browser, extensions}) {
try {
// open a new tab to finish setting up the 2captcha extension manually using its extension options page
page = await browser.newPage()
const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0]
const { options_url } = extensions.filter(ext => ext.name === 'twocaptcha')[0]
await page.goto(options_url)
await wait(2_500)
await page.bringToFront()

View File

@@ -27,36 +27,43 @@ class ArchiveBoxGroup(click.Group):
'init': 'archivebox.cli.archivebox_init.main',
'install': 'archivebox.cli.archivebox_install.main',
}
# Model commands (CRUD operations via subcommands)
model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
'tag': 'archivebox.cli.archivebox_tag.main',
'binary': 'archivebox.cli.archivebox_binary.main',
'process': 'archivebox.cli.archivebox_process.main',
'machine': 'archivebox.cli.archivebox_machine.main',
}
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'search': 'archivebox.cli.archivebox_search.main',
'status': 'archivebox.cli.archivebox_status.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
# Worker/orchestrator commands
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
# Worker command
'worker': 'archivebox.cli.archivebox_worker.main',
# Task commands (called by workers as subprocesses)
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'extract': 'archivebox.cli.archivebox_extract.main',
}
all_subcommands = {
**meta_commands,
**setup_commands,
**model_commands,
**archive_commands,
}
renamed_commands = {
'setup': 'install',
'list': 'search',
'import': 'add',
'archive': 'add',
'export': 'search',
# Old commands replaced by new model commands
'orchestrator': 'run',
'extract': 'archiveresult',
}
@classmethod
@@ -110,9 +117,9 @@ def cli(ctx, help=False):
if help or ctx.invoked_subcommand is None:
ctx.invoke(ctx.command.get_command(ctx, 'help'))
# if the subcommand is in the archive_commands dict and is not 'manage',
# if the subcommand is in archive_commands or model_commands,
# then we need to set up the django environment and check that we're in a valid data folder
if subcommand in ArchiveBoxGroup.archive_commands:
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
# print('SETUP DJANGO AND CHECK DATA FOLDER')
try:
from archivebox.config.django import setup_django

View File

@@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
archivebox archiveresult <action> [args...] [--filters]
Manage ArchiveResult records (plugin extraction results).
Actions:
create - Create ArchiveResults for Snapshots (queue extractions)
list - List ArchiveResults as JSONL (with optional filters)
update - Update ArchiveResults from stdin JSONL
delete - Delete ArchiveResults from stdin JSONL
Examples:
# Create ArchiveResults for snapshots (queue for extraction)
archivebox snapshot list --status=queued | archivebox archiveresult create
archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>
# List with filters
archivebox archiveresult list --status=failed
archivebox archiveresult list --plugin=screenshot --status=succeeded
# Update (reset failed extractions to queued)
archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued
# Delete
archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes
# Re-run failed extractions
archivebox archiveresult list --status=failed | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox archiveresult'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Narrow a QuerySet using Django-style lookups collected from CLI options.

    Entries whose value is None are ignored, as are the reserved 'limit' and
    'offset' keys. When a truthy limit is given, the result is sliced to at
    most that many rows.
    """
    reserved_keys = ('limit', 'offset')
    lookups = {
        name: val
        for name, val in filter_kwargs.items()
        if val is not None and name not in reserved_keys
    }

    # Only call .filter() when there is something to filter on
    if lookups:
        queryset = queryset.filter(**lookups)

    return queryset[:limit] if limit else queryset
# =============================================================================
# CREATE
# =============================================================================
def create_archiveresults(
    snapshot_id: Optional[str] = None,
    plugin: Optional[str] = None,
    status: str = 'queued',
) -> int:
    """
    Create (or re-queue) ArchiveResults for Snapshots.

    Snapshots come from `snapshot_id` when it is given, otherwise from JSONL
    records read from stdin. With `plugin` set, one ArchiveResult is
    get-or-created per snapshot for that plugin, and an existing FAILED or
    SKIPPED result is reset to `status`. Without `plugin`,
    snapshot.create_pending_archiveresults() is called and every QUEUED result
    of the snapshot is emitted.

    Args:
        snapshot_id: ID of a single Snapshot to operate on (skips stdin).
        plugin: Restrict creation to this plugin name.
        status: Status assigned to created/reset results (plugin mode only).

    Returns:
        0 on success; 1 when no usable input or matching Snapshot is found.
    """
    from django.core.exceptions import ValidationError
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
    from archivebox.core.models import Snapshot, ArchiveResult

    is_tty = sys.stdout.isatty()

    if snapshot_id:
        # Snapshot given directly on the command line
        try:
            snapshots = [Snapshot.objects.get(id=snapshot_id)]
        except (Snapshot.DoesNotExist, ValidationError, ValueError):
            # A malformed ID is reported like a missing one instead of crashing.
            # NOTE(review): assumes Snapshot.id is a UUID-like field for which
            # Django raises ValidationError/ValueError on bad input - confirm.
            rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
            return 1
    else:
        # Snapshots piped in as JSONL
        records = list(read_stdin())
        if not records:
            rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
            return 1

        snapshot_ids = []
        for record in records:
            record_id = record.get('id')
            record_type = record.get('type')
            # Accept explicit Snapshot records, plus untyped records (assumed to
            # carry a Snapshot ID). Records of any other type are skipped rather
            # than having their IDs looked up as Snapshots.
            if record_id and (not record_type or record_type == TYPE_SNAPSHOT):
                snapshot_ids.append(record_id)

        if not snapshot_ids:
            rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
            return 1

        snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))

    if not snapshots:
        rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
        return 1

    retryable_statuses = [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]
    created_count = 0
    for snapshot in snapshots:
        if plugin:
            # Create for the requested plugin only
            result, created = ArchiveResult.objects.get_or_create(
                snapshot=snapshot,
                plugin=plugin,
                defaults={
                    'status': status,
                    'retry_at': timezone.now(),
                }
            )
            if not created and result.status in retryable_statuses:
                # Existing failed/skipped result: reset it so it gets retried
                result.status = status
                result.retry_at = timezone.now()
                result.save()

            # JSONL is only written when stdout is piped
            if not is_tty:
                write_record(result.to_json())
            created_count += 1
        else:
            # Create results for all pending plugins, then emit everything queued
            snapshot.create_pending_archiveresults()
            for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
                if not is_tty:
                    write_record(result.to_json())
                created_count += 1

    rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
    return 0
# =============================================================================
# LIST
# =============================================================================
def list_archiveresults(
    status: Optional[str] = None,
    plugin: Optional[str] = None,
    snapshot_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List ArchiveResults, newest start time first, with optional filters.

    When stdout is a TTY a coloured human-readable line is printed per result;
    otherwise each result is written as a JSONL record. A summary count is
    always printed to stderr.

    Args:
        status: Only include results with this status.
        plugin: Only include results produced by this plugin.
        snapshot_id: Only include results belonging to this Snapshot.
        limit: Maximum number of results to list.

    Returns:
        Always 0, even when nothing matched.
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    # Loop-invariant colour table for the TTY output
    status_colors = {
        'queued': 'yellow',
        'started': 'blue',
        'succeeded': 'green',
        'failed': 'red',
        'skipped': 'dim',
        'backoff': 'magenta',
    }

    # select_related avoids one extra query per row for result.snapshot.url below
    queryset = ArchiveResult.objects.select_related('snapshot').order_by('-start_ts')

    # Apply filters
    filter_kwargs = {
        'status': status,
        'plugin': plugin,
        'snapshot_id': snapshot_id,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for result in queryset:
        if is_tty:
            status_color = status_colors.get(result.status, 'dim')
            rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
        else:
            write_record(result.to_json())
        count += 1

    rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_archiveresults(
    status: Optional[str] = None,
) -> int:
    """
    Apply CLI-specified changes to the ArchiveResults named on stdin.

    Each JSONL record read from stdin is matched to an ArchiveResult by its
    'id'. Only fields passed as flags are modified (PATCH semantics): when
    `status` is given, the status is set and retry_at is reset to now.
    Records without an id are skipped; unknown ids are reported on stderr.

    Returns:
        0 after processing the input, 1 when stdin yielded no records.
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    # Walk only the records that actually carry an id
    for result_id in filter(None, (record.get('id') for record in records)):
        try:
            result = ArchiveResult.objects.get(id=result_id)

            # Apply updates from CLI flags
            if status:
                result.status = status
                result.retry_at = timezone.now()

            result.save()
            updated_count += 1

            # Echo the updated record only when output is piped
            if not is_tty:
                write_record(result.to_json())
        except ArchiveResult.DoesNotExist:
            rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)

    rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
    """
    Remove the ArchiveResults whose ids arrive as JSONL on stdin.

    A dry run only reports what would be removed (showing at most ten rows).
    A real deletion is refused unless `yes` is set.

    Returns:
        0 on success, on a dry run, or when nothing matched;
        1 when there is no usable input or `yes` was not given.
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import ArchiveResult

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    result_ids = list(filter(None, (record.get('id') for record in records)))
    if not result_ids:
        rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
        return 1

    results = ArchiveResult.objects.filter(id__in=result_ids)
    count = results.count()
    if count == 0:
        rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        preview_size = 10
        rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
        for result in results[:preview_size]:
            rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
        if count > preview_size:
            rprint(f' ... and {count - 10} more', file=sys.stderr)
        return 0

    # The confirmation check comes after the dry-run branch, so --dry-run works without --yes
    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion
    deleted_count = results.delete()[0]
    rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
# Root click group; the create/list/update/delete subcommands attach to it below.
# The docstring doubles as the group's --help text.
@click.group()
def main():
    """Manage ArchiveResult records (plugin extraction results)."""
@main.command('create')
@click.option('--snapshot-id', help='Snapshot ID to create results for')
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
    """Create ArchiveResults for Snapshots from stdin JSONL."""
    # The worker's return value becomes the process exit status
    exit_code = create_archiveresults(
        snapshot_id=snapshot_id,
        plugin=plugin,
        status=status,
    )
    sys.exit(exit_code)
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
@click.option('--plugin', '-p', help='Filter by plugin name')
@click.option('--snapshot-id', help='Filter by snapshot ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(
    status: Optional[str],
    plugin: Optional[str],
    snapshot_id: Optional[str],
    limit: Optional[int],
):
    """List ArchiveResults as JSONL."""
    # Forward the parsed options unchanged; the return value is the exit status
    exit_code = list_archiveresults(status=status, plugin=plugin, snapshot_id=snapshot_id, limit=limit)
    sys.exit(exit_code)
@main.command('update')
@click.option('--status', '-s', help='Set status')
def update_cmd(status: Optional[str]):
    """Update ArchiveResults from stdin JSONL."""
    # The worker's return value becomes the process exit status
    exit_code = update_archiveresults(status=status)
    sys.exit(exit_code)
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete ArchiveResults from stdin JSONL."""
    # The worker's return value becomes the process exit status
    exit_code = delete_archiveresults(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,304 @@
#!/usr/bin/env python3
"""
archivebox binary <action> [args...] [--filters]
Manage Binary records (detected executables like chrome, wget, etc.).
Actions:
create - Create/register a Binary
list - List Binaries as JSONL (with optional filters)
update - Update Binaries from stdin JSONL
delete - Delete Binaries from stdin JSONL
Examples:
# List all binaries
archivebox binary list
# List specific binary
archivebox binary list --name=chrome
# List binaries with specific version
archivebox binary list --version__icontains=120
# Delete old binary entries
archivebox binary list --name=chrome | archivebox binary delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox binary'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Narrow a QuerySet using Django-style lookups collected from CLI options.

    Entries whose value is None are ignored, as are the reserved 'limit' and
    'offset' keys. When a truthy limit is given, the result is sliced to at
    most that many rows.
    """
    reserved_keys = ('limit', 'offset')
    lookups = {
        name: val
        for name, val in filter_kwargs.items()
        if val is not None and name not in reserved_keys
    }

    # Only call .filter() when there is something to filter on
    if lookups:
        queryset = queryset.filter(**lookups)

    return queryset[:limit] if limit else queryset
# =============================================================================
# CREATE
# =============================================================================
def create_binary(
    name: str,
    abspath: str,
    version: str = '',
) -> int:
    """
    Register a Binary, reusing an existing row for the same name and path.

    The record is looked up by (name, abspath); `version` is only applied when
    a new row is created. The resulting record is written as JSONL when stdout
    is piped, and a status line goes to stderr.

    Returns:
        0 when the binary exists afterwards, 1 on missing arguments or error.
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    is_tty = sys.stdout.isatty()

    if not (name and abspath):
        rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
        return 1

    try:
        binary, created = Binary.objects.get_or_create(
            name=name,
            abspath=abspath,
            defaults={'version': version},
        )

        if not is_tty:
            write_record(binary.to_json())

        # Pick the status line depending on whether a new row was inserted
        message = (
            f'[green]Created binary: {name} at {abspath}[/green]'
            if created else
            f'[dim]Binary already exists: {name} at {abspath}[/dim]'
        )
        rprint(message, file=sys.stderr)
        return 0
    except Exception as e:
        rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
        return 1
# =============================================================================
# LIST
# =============================================================================
def list_binaries(
    name: Optional[str] = None,
    abspath__icontains: Optional[str] = None,
    version__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    Print Binaries ordered by name, then most recently loaded first.

    On a TTY a human-readable line is printed per binary; otherwise each
    binary is written as a JSONL record. A summary count goes to stderr.

    Returns:
        Always 0, even when nothing matched.
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    is_tty = sys.stdout.isatty()

    # Collect the optional filters; unset ones are dropped by apply_filters
    filter_kwargs = dict(
        name=name,
        abspath__icontains=abspath__icontains,
        version__icontains=version__icontains,
    )
    binaries = apply_filters(
        Binary.objects.all().order_by('name', '-loaded_at'),
        filter_kwargs,
        limit=limit,
    )

    count = 0
    for count, binary in enumerate(binaries, start=1):
        if not is_tty:
            write_record(binary.to_json())
            continue
        rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')

    rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_binaries(
    version: Optional[str] = None,
    abspath: Optional[str] = None,
) -> int:
    """
    Apply CLI-specified changes to the Binaries named on stdin.

    Each JSONL record read from stdin is matched to a Binary by its 'id'.
    Only fields passed as flags are modified (PATCH semantics). Records
    without an id are skipped; unknown ids are reported on stderr.

    Returns:
        0 after processing the input, 1 when stdin yielded no records.
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.machine.models import Binary

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Only flags with a truthy value result in a field change
    changes = {
        field: value
        for field, value in (('version', version), ('abspath', abspath))
        if value
    }

    updated_count = 0
    for binary_id in filter(None, (record.get('id') for record in records)):
        try:
            binary = Binary.objects.get(id=binary_id)

            for field, value in changes.items():
                setattr(binary, field, value)

            binary.save()
            updated_count += 1

            # Echo the updated record only when output is piped
            if not is_tty:
                write_record(binary.to_json())
        except Binary.DoesNotExist:
            rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)

    rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
    """
    Remove the Binaries whose ids arrive as JSONL on stdin.

    A dry run only reports every binary that would be removed. A real
    deletion is refused unless `yes` is set.

    Returns:
        0 on success, on a dry run, or when nothing matched;
        1 when there is no usable input or `yes` was not given.
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.machine.models import Binary

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    binary_ids = list(filter(None, (record.get('id') for record in records)))
    if not binary_ids:
        rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
        return 1

    binaries = Binary.objects.filter(id__in=binary_ids)
    count = binaries.count()
    if count == 0:
        rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
        for binary in binaries:
            rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
        return 0

    # The confirmation check comes after the dry-run branch, so --dry-run works without --yes
    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion
    deleted_count = binaries.delete()[0]
    rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
# Root click group; the create/list/update/delete subcommands attach to it below.
# The docstring doubles as the group's --help text.
@click.group()
def main():
    """Manage Binary records (detected executables)."""
@main.command('create')
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
@click.option('--version', '-v', default='', help='Binary version')
def create_cmd(name: str, abspath: str, version: str):
    """Create/register a Binary."""
    # The worker's return value becomes the process exit status
    exit_code = create_binary(
        name=name,
        abspath=abspath,
        version=version,
    )
    sys.exit(exit_code)
@main.command('list')
@click.option('--name', '-n', help='Filter by name')
@click.option('--abspath__icontains', help='Filter by path contains')
@click.option('--version__icontains', help='Filter by version contains')
@click.option('--limit', type=int, help='Limit number of results')
def list_cmd(
    name: Optional[str],
    abspath__icontains: Optional[str],
    version__icontains: Optional[str],
    limit: Optional[int],
):
    """List Binaries as JSONL."""
    # Forward the parsed options unchanged; the return value is the exit status
    exit_code = list_binaries(
        name=name,
        abspath__icontains=abspath__icontains,
        version__icontains=version__icontains,
        limit=limit,
    )
    sys.exit(exit_code)
@main.command('update')
@click.option('--version', '-v', help='Set version')
@click.option('--abspath', '-p', help='Set path')
def update_cmd(version: Optional[str], abspath: Optional[str]):
    """Update Binaries from stdin JSONL."""
    # The worker's return value becomes the process exit status
    exit_code = update_binaries(version=version, abspath=abspath)
    sys.exit(exit_code)
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Binaries from stdin JSONL."""
    # The worker's return value becomes the process exit status
    exit_code = delete_binaries(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -1,108 +1,134 @@
#!/usr/bin/env python3
"""
archivebox crawl [urls...] [--depth=N] [--tag=TAG]
archivebox crawl <action> [args...] [--filters]
Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process.
Manage Crawl records.
Input formats:
- Plain URLs (one per line)
- JSONL: {"url": "...", "depth": 1, "tags": "..."}
Output (JSONL):
{"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...}
Actions:
create - Create Crawl jobs from URLs
list - List Crawls as JSONL (with optional filters)
update - Update Crawls from stdin JSONL
delete - Delete Crawls from stdin JSONL
Examples:
# Create a crawl job
archivebox crawl https://example.com
# Create
archivebox crawl create https://example.com https://foo.com --depth=1
archivebox crawl create --tag=news https://example.com
# Create crawl with depth
archivebox crawl --depth=1 https://example.com
# List with filters
archivebox crawl list --status=queued
archivebox crawl list --urls__icontains=example.com
# Full pipeline: create crawl, create snapshots, run extractors
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Update
archivebox crawl list --status=started | archivebox crawl update --status=queued
# Process existing Crawl by ID (runs the crawl state machine)
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
# Delete
archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes
# Full pipeline
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
import sys
from typing import Optional
from typing import Optional, Iterable
import rich_click as click
from rich import print as rprint
def create_crawls(
records: list,
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Narrow a QuerySet using Django-style lookups collected from CLI options.

    Entries whose value is None are ignored, as are the reserved 'limit' and
    'offset' keys. When a truthy limit is given, the result is sliced to at
    most that many rows.
    """
    reserved_keys = ('limit', 'offset')
    lookups = {
        name: val
        for name, val in filter_kwargs.items()
        if val is not None and name not in reserved_keys
    }

    # Only call .filter() when there is something to filter on
    if lookups:
        queryset = queryset.filter(**lookups)

    return queryset[:limit] if limit else queryset
# =============================================================================
# CREATE
# =============================================================================
def create_crawl(
urls: Iterable[str],
depth: int = 0,
tag: str = '',
status: str = 'queued',
created_by_id: Optional[int] = None,
) -> int:
"""
Create a single Crawl job from all input URLs.
Create a Crawl job from URLs.
Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
Does NOT start the crawl - just creates the job in QUEUED state.
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
Exit codes:
0: Success
1: Failure
"""
from rich import print as rprint
from archivebox.misc.jsonl import write_record
from archivebox.misc.jsonl import read_args_or_stdin, write_record
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
created_by_id = created_by_id or get_or_create_system_user_pk()
is_tty = sys.stdout.isatty()
# Collect all input records
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
# Collect all URLs into a single newline-separated string
urls = []
url_list = []
for record in records:
url = record.get('url')
if url:
urls.append(url)
url_list.append(url)
if not urls:
if not url_list:
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
return 1
try:
# Build crawl record with all URLs as newline-separated string
crawl_record = {
'urls': '\n'.join(urls),
'urls': '\n'.join(url_list),
'max_depth': depth,
'tags_str': tag,
'status': status,
'label': '',
}
crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
if not crawl:
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
return 1
# Output JSONL record (only when piped)
if not is_tty:
write_record(crawl.to_jsonl())
write_record(crawl.to_json())
rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr)
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
for url in urls[:5]: # Show first 5 URLs
for url in url_list[:5]: # Show first 5 URLs
rprint(f' {url[:70]}', file=sys.stderr)
if len(urls) > 5:
rprint(f' ... and {len(urls) - 5} more', file=sys.stderr)
if len(url_list) > 5:
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
return 0
@@ -111,81 +137,217 @@ def create_crawls(
return 1
def process_crawl_by_id(crawl_id: str) -> int:
"""
Process a single Crawl by ID (used by workers).
# =============================================================================
# LIST
# =============================================================================
Triggers the Crawl's state machine tick() which will:
- Transition from queued -> started (creates root snapshot)
- Transition from started -> sealed (when all snapshots done)
def list_crawls(
status: Optional[str] = None,
urls__icontains: Optional[str] = None,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
) -> int:
"""
from rich import print as rprint
List Crawls as JSONL with optional filters.
Exit codes:
0: Success (even if no results)
"""
from archivebox.misc.jsonl import write_record
from archivebox.crawls.models import Crawl
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
return 1
is_tty = sys.stdout.isatty()
rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
queryset = Crawl.objects.all().order_by('-created_at')
try:
crawl.sm.tick()
crawl.refresh_from_db()
rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
# Apply filters
filter_kwargs = {
'status': status,
'urls__icontains': urls__icontains,
'max_depth': max_depth,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for crawl in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(crawl.status, 'dim')
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
else:
write_record(crawl.to_json())
count += 1
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
return 0
def is_crawl_id(value: str) -> bool:
"""Check if value looks like a Crawl UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually a Crawl (not a Snapshot or other object)
# =============================================================================
# UPDATE
# =============================================================================
def update_crawls(
status: Optional[str] = None,
max_depth: Optional[int] = None,
) -> int:
"""
Update Crawls from stdin JSONL.
Reads Crawl records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.crawls.models import Crawl
return Crawl.objects.filter(id=value).exists()
is_tty = sys.stdout.isatty()
@click.command()
@click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots')
@click.argument('args', nargs=-1)
def main(depth: int, tag: str, args: tuple):
"""Create Crawl jobs from URLs, or process existing Crawls by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
# Read all input
records = list(read_args_or_stdin(args))
records = list(read_stdin())
if not records:
from rich import print as rprint
rprint('[yellow]No URLs or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
sys.exit(1)
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Check if input looks like existing Crawl IDs to process
# If ALL inputs are Crawl UUIDs, process them
all_are_crawl_ids = all(
is_crawl_id(r.get('id') or r.get('url', ''))
for r in records
)
updated_count = 0
for record in records:
crawl_id = record.get('id')
if not crawl_id:
continue
if all_are_crawl_ids:
# Process existing Crawls by ID
exit_code = 0
for record in records:
crawl_id = record.get('id') or record.get('url')
result = process_crawl_by_id(crawl_id)
if result != 0:
exit_code = result
sys.exit(exit_code)
else:
# Default behavior: create Crawl jobs from URLs
sys.exit(create_crawls(records, depth=depth, tag=tag))
try:
crawl = Crawl.objects.get(id=crawl_id)
# Apply updates from CLI flags
if status:
crawl.status = status
crawl.retry_at = timezone.now()
if max_depth is not None:
crawl.max_depth = max_depth
crawl.save()
updated_count += 1
if not is_tty:
write_record(crawl.to_json())
except Crawl.DoesNotExist:
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls referenced by JSONL records read from stdin.

    Each input record must carry an 'id' key; records without one are ignored.
    With dry_run=True the matching crawls are listed and nothing is deleted.
    Otherwise yes=True is required before anything is removed.

    Args:
        yes: Confirm that the matching crawls should really be deleted.
        dry_run: Only print what would be deleted.

    Exit codes:
        0: Success (including "nothing matched" and dry runs)
        1: No input, no usable IDs, or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    crawl_ids = [r.get('id') for r in records if r.get('id')]
    if not crawl_ids:
        rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
        return 1

    crawls = Crawl.objects.filter(id__in=crawl_ids)
    count = crawls.count()
    if count == 0:
        rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
        for crawl in crawls:
            url_preview = crawl.urls[:50].replace('\n', ' ')
            rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # QuerySet.delete() returns (total_rows, {model_label: rows}). The total
    # also counts rows removed through cascading relations, so report only the
    # rows that belong to the Crawl model itself.
    _total_deleted, deleted_by_model = crawls.delete()
    deleted_count = deleted_by_model.get(Crawl._meta.label, 0)
    rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Crawl records."""
    # Container group only: the subcommands registered on it do the work.
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
    """Create a Crawl job from URLs or stdin."""
    # The helper's return value becomes the process exit status.
    exit_code = create_crawl(urls, depth=depth, tag=tag, status=status)
    sys.exit(exit_code)
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--urls__icontains', help='Filter by URLs contains')
@click.option('--max-depth', type=int, help='Filter by max depth')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
             max_depth: Optional[int], limit: Optional[int]):
    """List Crawls as JSONL."""
    # Forward every CLI option unchanged; unset options arrive as None.
    filters = dict(
        status=status,
        urls__icontains=urls__icontains,
        max_depth=max_depth,
        limit=limit,
    )
    sys.exit(list_crawls(**filters))
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--max-depth', type=int, help='Set max depth')
def update_cmd(status: Optional[str], max_depth: Optional[int]):
    """Update Crawls from stdin JSONL."""
    # Options left unset stay None and are passed through as-is.
    exit_code = update_crawls(status=status, max_depth=max_depth)
    sys.exit(exit_code)
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Crawls from stdin JSONL."""
    # The helper's return value becomes the process exit status.
    exit_code = delete_crawls(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
if __name__ == '__main__':

View File

@@ -1,265 +0,0 @@
#!/usr/bin/env python3
"""
archivebox extract [snapshot_ids...] [--plugins=NAMES]
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
Input formats:
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
Output (JSONL):
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
Examples:
# Extract specific snapshot
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
# Pipe from snapshot command
archivebox snapshot https://example.com | archivebox extract
# Run specific plugins only
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
# Chain commands
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import Optional, List
import rich_click as click
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Run extraction for a single ArchiveResult by ID (used by workers).

    Looks the ArchiveResult up, calls tick() on its state machine, reloads the
    row and reports the resulting status.

    Args:
        archiveresult_id: Primary key of the ArchiveResult to process.

    Returns:
        0 when the result succeeded or is in any state other than failed,
        1 when it was not found, failed, or tick() raised.
    """
    from rich import print as rprint
    from archivebox.core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
        # Trigger state machine tick - this runs the actual extraction
        archiveresult.sm.tick()
        archiveresult.refresh_from_db()

        # All messages below contain Rich markup, so they must go through
        # rprint; the builtin print would emit the [color] tags literally.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
            return 0
        if archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
            return 1

        # Still in progress or backoff - not a failure
        rprint(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
        return 0
    except Exception as e:
        rprint(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
def _collect_snapshot_ids(records) -> set:
    """Return the set of Snapshot IDs referenced by the parsed input records."""
    from rich import print as rprint
    from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    from archivebox.core.models import Snapshot

    snapshot_ids = set()
    for record in records:
        record_type = record.get('type')

        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get('url'):
                # Look up by URL (get most recent if multiple exist)
                snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
                if snap:
                    snapshot_ids.add(str(snap.id))
                else:
                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)

        elif record_type == TYPE_ARCHIVERESULT:
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)

        elif 'id' in record:
            # Untyped record: assume the id is a snapshot ID
            snapshot_ids.add(record['id'])

    return snapshot_ids


def _queue_plugins_for_snapshot(snapshot, plugins_list) -> None:
    """Ensure the snapshot has queued ArchiveResults and is not left sealed."""
    from django.utils import timezone
    from archivebox.core.models import Snapshot, ArchiveResult

    if plugins_list:
        # Only create (or re-queue) results for the requested plugins
        retryable = [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]
        for plugin_name in plugins_list:
            result, created = ArchiveResult.objects.get_or_create(
                snapshot=snapshot,
                plugin=plugin_name,
                defaults={
                    'status': ArchiveResult.StatusChoices.QUEUED,
                    'retry_at': timezone.now(),
                },
            )
            if not created and result.status in retryable:
                # Reset for retry
                result.status = ArchiveResult.StatusChoices.QUEUED
                result.retry_at = timezone.now()
                result.save()
    else:
        # Create all pending plugins
        snapshot.create_pending_archiveresults()

    # Reset snapshot status to allow processing
    if snapshot.status == Snapshot.StatusChoices.SEALED:
        snapshot.status = Snapshot.StatusChoices.STARTED
        snapshot.retry_at = timezone.now()
        snapshot.save()


def _report_plugin_results(snapshot_ids, plugins_list, is_tty: bool) -> None:
    """Print results per snapshot: colored lines on a TTY, JSONL when piped."""
    from rich import print as rprint
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot

    status_colors = {
        'succeeded': 'green',
        'failed': 'red',
        'skipped': 'yellow',
    }
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            continue

        results = snapshot.archiveresult_set.all()
        if plugins_list:
            results = results.filter(plugin__in=plugins_list)

        for result in results:
            if is_tty:
                status_color = status_colors.get(result.status, 'dim')
                rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ""}', file=sys.stderr)
            else:
                write_record(result.to_jsonl())


def run_plugins(
    args: tuple,
    plugins: str = '',
    wait: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, queues the matching snapshots
    for extraction, optionally runs the orchestrator until idle, then prints
    the resulting ArchiveResults.

    Args:
        args: Positional CLI arguments, passed to read_args_or_stdin().
        plugins: Comma-separated plugin names; empty means all plugins.
        wait: Run the orchestrator and wait for it before reporting results.

    Exit codes:
        0: Success
        1: Failure (no input, no usable snapshot IDs, or nothing to process)
    """
    from rich import print as rprint
    from archivebox.misc.jsonl import read_args_or_stdin
    from archivebox.core.models import Snapshot
    from archivebox.workers.orchestrator import Orchestrator

    is_tty = sys.stdout.isatty()

    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []

    # Collect all input records
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    snapshot_ids = _collect_snapshot_ids(records)
    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    # Queue every snapshot that actually exists
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue
        _queue_plugins_for_snapshot(snapshot, plugins_list)
        processed_count += 1

    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Output results as JSONL (when piped) or human-readable (when TTY)
    _report_plugin_results(snapshot_ids, plugins_list, is_tty)
    return 0
def is_archiveresult_id(value: str) -> bool:
    """Return True when value is UUID-shaped and an ArchiveResult has that ID."""
    import re

    looks_like_uuid = re.match(
        r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
        value,
        re.I,
    )
    if looks_like_uuid is None:
        return False

    # A UUID shape alone is not enough: the ID could belong to a Snapshot or
    # another object, so confirm it against the ArchiveResult table.
    from archivebox.core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()
@click.command()
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input up front so it can be classified before dispatching
    parsed = list(read_args_or_stdin(args))
    if not parsed:
        from rich import print as rprint
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    every_input_is_archiveresult = all(
        is_archiveresult_id(rec.get('id') or rec.get('url', ''))
        for rec in parsed
    )

    if not every_input_is_archiveresult:
        # Default behavior: run plugins on Snapshots from input.
        # NOTE(review): run_plugins() calls read_args_or_stdin(args) again; if
        # that helper consumes stdin, piped input may already be exhausted by
        # the read above - confirm.
        sys.exit(run_plugins(args, plugins=plugins, wait=wait))

    # Process existing ArchiveResults by ID; the last non-zero status wins
    exit_code = 0
    for rec in parsed:
        outcome = process_archiveresult_by_id(rec.get('id') or rec.get('url'))
        if outcome != 0:
            exit_code = outcome
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
if pending_links:
for link_dict in pending_links.values():
Snapshot.from_jsonl(link_dict)
Snapshot.from_json(link_dict)
# Hint for orphaned snapshot directories
print()

View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
archivebox machine <action> [--filters]
Manage Machine records (system-managed, mostly read-only).
Machine records track the host machines where ArchiveBox runs.
They are created automatically by the system and are primarily for debugging.
Actions:
list - List Machines as JSONL (with optional filters)
Examples:
# List all machines
archivebox machine list
# List machines by hostname
archivebox machine list --hostname__icontains=myserver
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox machine'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Narrow a QuerySet using Django-style lookups taken from CLI kwargs.

    Entries whose value is None, and the 'limit'/'offset' keys, are dropped.
    The remaining entries are passed to queryset.filter(). A truthy limit
    slices the result to at most that many rows.
    """
    active = {
        name: val
        for name, val in filter_kwargs.items()
        if val is not None and name not in ('limit', 'offset')
    }
    if active:
        queryset = queryset.filter(**active)
    return queryset[:limit] if limit else queryset
# =============================================================================
# LIST
# =============================================================================
def list_machines(
    hostname__icontains: Optional[str] = None,
    os_platform: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    Print Machine records, newest first, with optional filters.

    On a TTY each machine is shown as one human-readable line; otherwise each
    machine is written as a JSONL record. A summary count goes to stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Machine

    interactive = sys.stdout.isatty()

    # Apply filters
    machines = apply_filters(
        Machine.objects.all().order_by('-created_at'),
        {
            'hostname__icontains': hostname__icontains,
            'os_platform': os_platform,
        },
        limit=limit,
    )

    total = 0
    for total, machine in enumerate(machines, start=1):
        if not interactive:
            write_record(machine.to_json())
            continue
        rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')

    rprint(f'[dim]Listed {total} machines[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Machine records (read-only, system-managed)."""
    # Container group only: the subcommands registered on it do the work.
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
    """List Machines as JSONL."""
    # Forward every CLI option unchanged; unset options arrive as None.
    exit_code = list_machines(
        hostname__icontains=hostname__icontains,
        os_platform=os_platform,
        limit=limit,
    )
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""
archivebox orchestrator [--daemon]
Start the orchestrator process that manages workers.
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox orchestrator'
import sys
import rich_click as click
from archivebox.misc.util import docstring
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
    """
    Start the orchestrator process.
    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)
    Args:
    daemon: Run forever (don't exit when idle)
    watch: Just watch the queues without spawning workers (for debugging)
    Exit codes:
    0: All work completed successfully
    1: Error occurred
    """
    # The docstring above is reused verbatim as the CLI help text.
    # NOTE(review): `watch` is accepted but never read in this function -
    # confirm whether it is supposed to be forwarded to the Orchestrator.
    from rich import print as rprint
    from archivebox.workers.orchestrator import Orchestrator

    # The messages below contain Rich markup, so they go through rprint; the
    # builtin print would emit the [color] tags literally.
    if Orchestrator.is_running():
        rprint('[yellow]Orchestrator is already running[/yellow]')
        return 0

    try:
        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
        orchestrator_instance.runloop()
        return 0
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground orchestrator
        return 0
    except Exception as e:
        rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
    """Start the ArchiveBox orchestrator process"""
    # The helper's return value becomes the process exit status.
    exit_code = orchestrator(daemon=daemon, watch=watch)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""
archivebox process <action> [--filters]
Manage Process records (system-managed, mostly read-only).
Process records track executions of binaries during extraction.
They are created automatically by the system and are primarily for debugging.
Actions:
list - List Processes as JSONL (with optional filters)
Examples:
# List all processes
archivebox process list
# List processes by binary
archivebox process list --binary-name=chrome
# List recent processes
archivebox process list --limit=10
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox process'
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Apply CLI-provided Django lookups to a QuerySet and optionally cap it.

    Keys named 'limit' or 'offset' and entries whose value is None never
    reach queryset.filter(). When limit is truthy the result is sliced to
    that many rows.
    """
    reserved = ('limit', 'offset')
    lookups = {}
    for lookup, wanted in filter_kwargs.items():
        if wanted is None or lookup in reserved:
            continue
        lookups[lookup] = wanted

    filtered = queryset.filter(**lookups) if lookups else queryset
    if not limit:
        return filtered
    return filtered[:limit]
# =============================================================================
# LIST
# =============================================================================
def list_processes(
    binary_name: Optional[str] = None,
    machine_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    Print Process records, most recently started first, with optional filters.

    On a TTY each process is shown as a colored line with its exit code and
    binary name; otherwise each process is written as a JSONL record.
    A summary count goes to stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Process

    interactive = sys.stdout.isatty()
    processes = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')

    # Only truthy CLI values become lookups
    requested = (('binary__name', binary_name), ('machine_id', machine_id))
    lookups = {field: wanted for field, wanted in requested if wanted}
    processes = apply_filters(processes, lookups, limit=limit)

    total = 0
    for total, process in enumerate(processes, start=1):
        if not interactive:
            write_record(process.to_json())
            continue

        binary_name_str = process.binary.name if process.binary else 'unknown'
        exit_code = '?' if process.returncode is None else process.returncode
        # green = exited 0, red = non-zero exit, yellow = no exit code recorded
        if process.returncode == 0:
            status_color = 'green'
        elif process.returncode:
            status_color = 'red'
        else:
            status_color = 'yellow'
        rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')

    rprint(f'[dim]Listed {total} processes[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Process records (read-only, system-managed)."""
    # Container group only: the subcommands registered on it do the work.
@main.command('list')
@click.option('--binary-name', '-b', help='Filter by binary name')
@click.option('--machine-id', '-m', help='Filter by machine ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
    """List Processes as JSONL."""
    # Forward every CLI option unchanged; unset options arrive as None.
    exit_code = list_processes(
        binary_name=binary_name,
        machine_id=machine_id,
        limit=limit,
    )
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -1,98 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
import shutil
from pathlib import Path
from typing import Iterable
import rich_click as click
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.config.django import setup_django
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import (
log_list_started,
log_list_finished,
log_removal_started,
log_removal_finished,
TimedProgress,
)
@enforce_types
def remove(filter_patterns: Iterable[str]=(),
           filter_type: str='exact',
           snapshots: QuerySet | None=None,
           after: float | None=None,
           before: float | None=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=DATA_DIR) -> QuerySet:
    """Remove the specified URLs from the archive"""
    # The docstring above is reused as the CLI help text, so it stays as-is.
    setup_django()
    check_data_folder()

    from archivebox.cli.archivebox_search import get_snapshots

    # Step 1: resolve which snapshots match the filters
    log_list_started(filter_patterns, filter_type)
    lookup_timer = TimedProgress(360, prefix=' ')
    try:
        patterns = list(filter_patterns) if filter_patterns else None
        snapshots = get_snapshots(
            snapshots=snapshots,
            filter_patterns=patterns,
            filter_type=filter_type,
            after=after,
            before=before,
        )
    finally:
        lookup_timer.end()

    if not snapshots.exists():
        log_removal_finished(0, 0)
        raise SystemExit(1)

    # Step 2: report the matches and announce the removal
    log_list_finished(snapshots)
    log_removal_started(snapshots, yes=yes, delete=delete)

    # Step 3: with delete=True, remove each snapshot's output directory
    removal_timer = TimedProgress(360, prefix=' ')
    try:
        for snapshot in snapshots:
            if delete:
                shutil.rmtree(snapshot.output_dir, ignore_errors=True)
    finally:
        removal_timer.end()

    # Count before deleting so the final report shows how many were removed
    to_remove = snapshots.count()

    from archivebox.search import flush_search_index
    from archivebox.core.models import Snapshot

    # Step 4: drop the search-index entries, then the database rows
    flush_search_index(snapshots=snapshots)
    snapshots.delete()

    remaining = Snapshot.objects.all()
    log_removal_finished(remaining.count(), to_remove)
    return remaining
@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
    """Remove the specified URLs from the archive"""
    # Every parsed option/argument maps one-to-one onto a remove() parameter.
    options = kwargs
    remove(**options)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
archivebox run [--daemon]
Unified command for processing queued work.
Modes:
- With stdin JSONL: Process piped records, exit when complete
- Without stdin (TTY): Run orchestrator in foreground until killed
Examples:
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
archivebox run
# Run as daemon (don't exit on idle)
archivebox run --daemon
# Process specific records (pipe any JSONL type, exits when done)
archivebox snapshot list --status=queued | archivebox run
archivebox archiveresult list --status=failed | archivebox run
archivebox crawl list --status=queued | archivebox run
# Mixed types work too
cat mixed_records.jsonl | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox run'
import sys
import rich_click as click
from rich import print as rprint
def process_stdin_records() -> int:
    """
    Re-queue the records piped in on stdin, then run the orchestrator.

    Crawl, Snapshot and ArchiveResult records are looked up by their 'id'.
    Those whose status is still pending get retry_at set to now and are saved.
    Records without an id, or with any other type, are skipped.
    If at least one record was re-queued, an Orchestrator runs until idle.

    Returns exit code (always 0; missing records only produce a warning).
    """
    from django.utils import timezone
    from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.workers.orchestrator import Orchestrator

    records = list(read_stdin())
    if not records:
        return 0  # Nothing to process

    # (record type, model, names of the statuses that may be re-queued)
    requeue_rules = (
        (TYPE_CRAWL, Crawl, ('QUEUED', 'STARTED')),
        (TYPE_SNAPSHOT, Snapshot, ('QUEUED', 'STARTED')),
        (TYPE_ARCHIVERESULT, ArchiveResult, ('QUEUED', 'STARTED', 'BACKOFF')),
    )
    not_found_errors = (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist)

    queued_count = 0
    for record in records:
        record_type = record.get('type')
        record_id = record.get('id')
        if not record_id:
            continue

        rule = next(
            ((model, status_names) for type_name, model, status_names in requeue_rules
             if record_type == type_name),
            None,
        )
        if rule is None:
            continue
        model, status_names = rule

        try:
            obj = model.objects.get(id=record_id)
            requeueable = [getattr(model.StatusChoices, name) for name in status_names]
            if obj.status in requeueable:
                obj.retry_at = timezone.now()
                obj.save()
                queued_count += 1
        except not_found_errors:
            rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr)
            continue

    if queued_count == 0:
        rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
        return 0

    rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)

    # Run orchestrator until all queued work is done
    Orchestrator(exit_on_idle=True).runloop()
    return 0
def run_orchestrator(daemon: bool = False) -> int:
    """
    Run an Orchestrator in the foreground.

    Returns immediately when Orchestrator.is_running() reports one already
    running. Otherwise an Orchestrator is created and its runloop() is run;
    it is told to exit on idle unless daemon is set.

    Args:
        daemon: Run forever (don't exit when idle)

    Returns exit code (0 = success or interrupted, 1 = error).
    """
    from archivebox.workers.orchestrator import Orchestrator

    if Orchestrator.is_running():
        rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr)
        return 0

    exit_on_idle = not daemon
    try:
        Orchestrator(exit_on_idle=exit_on_idle).runloop()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground orchestrator
        return 0
    except Exception as err:
        rprint(f'[red]Orchestrator error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        return 1
    return 0
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
def main(daemon: bool):
    """
    Process queued work.
    When stdin is piped: Process those specific records and exit.
    When run standalone: Run orchestrator in foreground.
    """
    # A non-TTY stdin means input is being piped in
    stdin_is_piped = not sys.stdin.isatty()
    if stdin_is_piped:
        exit_code = process_stdin_records()
    else:
        exit_code = run_orchestrator(daemon=daemon)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Any
import rich_click as click
from rich import print
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
# Filter types for URL matching
# Maps each filter-type name to a function that turns a user-supplied pattern
# into a dict of Django-style lookup kwargs. The keys are also offered as
# --filter-type choices by the search command in this module.
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
# NOTE(review): this only matches URLs starting with http://, so https://
# URLs on the same domain look excluded - confirm this is intended.
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
# Allowed values for the --status option of the search command.
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
def get_snapshots(snapshots: Optional[QuerySet]=None,
                  filter_patterns: Optional[List[str]]=None,
                  filter_type: str='substring',
                  after: Optional[float]=None,
                  before: Optional[float]=None,
                  out_dir: Path=DATA_DIR) -> QuerySet:
    """
    Filter and return Snapshots matching the given criteria.

    Args:
        snapshots: Base queryset to narrow; None means all Snapshots.
        filter_patterns: Patterns passed to Snapshot.objects.filter_by_patterns().
        filter_type: Matching mode passed along with filter_patterns.
        after: Keep only snapshots with timestamp >= this value.
        before: Keep only snapshots with timestamp < this value.
        out_dir: Unused here; kept for interface compatibility.

    Returns:
        The narrowed queryset. It may be empty, in which case a warning is
        written to stderr.
    """
    from archivebox.core.models import Snapshot

    # Test against None, not truthiness: an empty queryset passed by the
    # caller must stay empty instead of silently widening to every Snapshot
    # (and truthiness would also evaluate the whole queryset).
    result = snapshots if snapshots is not None else Snapshot.objects.all()

    if after is not None:
        result = result.filter(timestamp__gte=after)
    if before is not None:
        result = result.filter(timestamp__lt=before)

    if filter_patterns:
        # Intersect with the pattern matches rather than replacing `result`,
        # so the caller's queryset and the after/before bounds still apply.
        # NOTE(review): assumes filter_by_patterns() returns a Snapshot
        # QuerySet - confirm against the manager implementation.
        matching = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
        result = result.filter(pk__in=matching.values('pk'))

    if not result.exists():
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return result
@enforce_types
def search(filter_patterns: list[str] | None=None,
           filter_type: str='substring',
           status: str='indexed',
           before: float | None=None,
           after: float | None=None,
           sort: str | None=None,
           json: bool=False,
           html: bool=False,
           csv: str | None=None,
           with_headers: bool=False):
    """List, filter, and export information about archive entries"""
    from archivebox.core.models import Snapshot

    # --with-headers only makes sense together with an export format
    export_requested = json or html or csv
    if with_headers and not export_requested:
        stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
        raise SystemExit(2)

    # Query DB directly - no filesystem scanning
    patterns = list(filter_patterns) if filter_patterns else None
    snapshots = get_snapshots(
        filter_patterns=patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    # Apply status filter; 'indexed' means all snapshots (no filter)
    if status in ('archived', 'unarchived'):
        snapshots = snapshots.filter(downloaded_at__isnull=(status == 'unarchived'))

    if sort:
        snapshots = snapshots.order_by(sort)

    # Export to requested format (first matching flag wins: json, html, csv)
    if json:
        output = snapshots.to_json(with_headers=with_headers)
    elif html:
        output = snapshots.to_html(with_headers=with_headers)
    elif csv:
        columns = csv.split(',')
        output = snapshots.to_csv(cols=columns, header=with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders
        # Convert to dict for printable_folders
        folders = {snap.output_dir: snap for snap in snapshots}
        output = printable_folders(folders, with_headers)

    print(output)
    return output
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    # CLI entry point: click collects every option/argument above into kwargs,
    # which map one-to-one onto search()'s keyword parameters.
    rendered = search(**kwargs)
    return rendered
if __name__ == '__main__':
main()

View File

@@ -1,93 +1,76 @@
#!/usr/bin/env python3
"""
archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES]
archivebox snapshot <action> [args...] [--filters]
Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs.
Manage Snapshot records.
Input formats:
- Plain URLs (one per line)
- JSONL: {"type": "Crawl", "id": "...", "urls": "..."}
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
- Crawl UUIDs (one per line)
Output (JSONL):
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
Actions:
create - Create Snapshots from URLs or Crawl JSONL
list - List Snapshots as JSONL (with optional filters)
update - Update Snapshots from stdin JSONL
delete - Delete Snapshots from stdin JSONL
Examples:
# Create snapshots from URLs directly
archivebox snapshot https://example.com https://foo.com
# Create
archivebox snapshot create https://example.com --tag=news
archivebox crawl create https://example.com | archivebox snapshot create
# Pipe from crawl command
archivebox crawl https://example.com | archivebox snapshot
# List with filters
archivebox snapshot list --status=queued
archivebox snapshot list --url__icontains=example.com
# Chain with extract
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Update
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
# Run specific plugins after creating snapshots
archivebox snapshot --plugins=screenshot,singlefile https://example.com
# Process existing Snapshot by ID
archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef
# Delete
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
import sys
from typing import Optional
from typing import Optional, Iterable
import rich_click as click
from archivebox.misc.util import docstring
from rich import print as rprint
def process_snapshot_by_id(snapshot_id: str) -> int:
"""
Process a single Snapshot by ID (used by workers).
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
Triggers the Snapshot's state machine tick() which will:
- Transition from queued -> started (creates pending ArchiveResults)
- Transition from started -> sealed (when all ArchiveResults done)
"""
from rich import print as rprint
from archivebox.core.models import Snapshot
if filters:
queryset = queryset.filter(**filters)
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
return 1
if limit:
queryset = queryset[:limit]
rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)
return queryset
try:
snapshot.sm.tick()
snapshot.refresh_from_db()
rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
return 1
# =============================================================================
# CREATE
# =============================================================================
def create_snapshots(
args: tuple,
urls: Iterable[str],
tag: str = '',
plugins: str = '',
status: str = 'queued',
depth: int = 0,
created_by_id: Optional[int] = None,
) -> int:
"""
Create Snapshots from URLs, Crawl JSONL, or Crawl IDs.
Reads from args or stdin, creates Snapshot objects, outputs JSONL.
If --plugins is passed, also runs specified plugins (blocking).
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
Exit codes:
0: Success
1: Failure
"""
from rich import print as rprint
from django.utils import timezone
from archivebox.misc.jsonl import (
@@ -102,7 +85,7 @@ def create_snapshots(
is_tty = sys.stdout.isatty()
# Collect all input records
records = list(read_args_or_stdin(args))
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
@@ -122,47 +105,44 @@ def create_snapshots(
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
# Crawl doesn't exist, create it
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
else:
# No ID, create new crawl
crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
if not crawl:
continue
# Create snapshots for each URL in the crawl
for url in crawl.get_urls_list():
# Merge CLI tags with crawl tags
merged_tags = crawl.tags_str
if tag:
if merged_tags:
merged_tags = f"{merged_tags},{tag}"
else:
merged_tags = tag
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
snapshot_record = {
'url': url,
'tags': merged_tags,
'crawl_id': str(crawl.id),
'depth': 0,
'depth': depth,
'status': status,
}
snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_jsonl())
write_record(snapshot.to_json())
elif record_type == TYPE_SNAPSHOT or record.get('url'):
# Input is a Snapshot or plain URL
# Add tags if provided via CLI
if tag and not record.get('tags'):
record['tags'] = tag
if status:
record['status'] = status
record['depth'] = record.get('depth', depth)
snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_jsonl())
write_record(snapshot.to_json())
except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
@@ -174,93 +154,237 @@ def create_snapshots(
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
for snapshot in created_snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
# If --plugins is passed, create ArchiveResults and run the orchestrator
if plugins:
from archivebox.core.models import ArchiveResult
from archivebox.workers.orchestrator import Orchestrator
# Parse comma-separated plugins list
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()]
# Create ArchiveResults for the specific plugins on each snapshot
for snapshot in created_snapshots:
for plugin_name in plugins_list:
result, created = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin_name,
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
# Reset for retry
result.status = ArchiveResult.StatusChoices.QUEUED
result.retry_at = timezone.now()
result.save()
rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
return 0
def is_snapshot_id(value: str) -> bool:
"""Check if value looks like a Snapshot UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually a Snapshot (not a Crawl or other object)
# =============================================================================
# LIST
# =============================================================================
def list_snapshots(
    status: Optional[str] = None,
    url__icontains: Optional[str] = None,
    url__istartswith: Optional[str] = None,
    tag: Optional[str] = None,
    crawl_id: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Snapshots as JSONL with optional filters.

    Snapshots are ordered newest-first by created_at. Every non-None filter
    is AND-ed together; `tag` matches the tag name case-insensitively and
    `limit` caps the number of rows after all filters have been applied.

    When stdout is a TTY a coloured human-readable line is printed per
    snapshot; otherwise each snapshot is written as a JSONL record. A
    summary count always goes to stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

    queryset = Snapshot.objects.all().order_by('-created_at')

    # Apply simple field filters (limit is deliberately NOT passed here, see below)
    filter_kwargs = {
        'status': status,
        'url__icontains': url__icontains,
        'url__istartswith': url__istartswith,
        'crawl_id': crawl_id,
    }
    queryset = apply_filters(queryset, filter_kwargs)

    # Tag filter requires special handling (M2M)
    if tag:
        queryset = queryset.filter(tags__name__iexact=tag)

    # Slice last: Django refuses to .filter() a queryset once it has been
    # sliced, so limiting before the tag filter made --tag + --limit crash.
    if limit:
        queryset = queryset[:limit]

    count = 0
    for snapshot in queryset:
        if is_tty:
            status_color = {
                'queued': 'yellow',
                'started': 'blue',
                'sealed': 'green',
            }.get(snapshot.status, 'dim')
            rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
        else:
            write_record(snapshot.to_json())
        count += 1

    rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
    return 0
@click.command()
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)')
@click.argument('args', nargs=-1)
def main(tag: str, plugins: str, args: tuple):
"""Create Snapshots from URLs/Crawls, or process existing Snapshots by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
# =============================================================================
# UPDATE
# =============================================================================
# Read all input
records = list(read_args_or_stdin(args))
def update_snapshots(
status: Optional[str] = None,
tag: Optional[str] = None,
) -> int:
"""
Update Snapshots from stdin JSONL.
Reads Snapshot records from stdin and applies updates.
Uses PATCH semantics - only specified fields are updated.
Exit codes:
0: Success
1: No input or error
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
records = list(read_stdin())
if not records:
from rich import print as rprint
rprint('[yellow]No URLs, Crawl IDs, or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
sys.exit(1)
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Check if input looks like existing Snapshot IDs to process
# If ALL inputs are UUIDs with no URL and exist as Snapshots, process them
all_are_snapshot_ids = all(
is_snapshot_id(r.get('id') or r.get('url', ''))
for r in records
if r.get('type') != 'Crawl' # Don't check Crawl records as Snapshot IDs
)
updated_count = 0
for record in records:
snapshot_id = record.get('id')
if not snapshot_id:
continue
# But also check that we're not receiving Crawl JSONL
has_crawl_records = any(r.get('type') == 'Crawl' for r in records)
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
if all_are_snapshot_ids and not has_crawl_records:
# Process existing Snapshots by ID
exit_code = 0
for record in records:
snapshot_id = record.get('id') or record.get('url')
result = process_snapshot_by_id(snapshot_id)
if result != 0:
exit_code = result
sys.exit(exit_code)
else:
# Create new Snapshots from URLs or Crawls
sys.exit(create_snapshots(args, tag=tag, plugins=plugins))
# Apply updates from CLI flags (override stdin values)
if status:
snapshot.status = status
snapshot.retry_at = timezone.now()
if tag:
# Add tag to existing tags
snapshot.save() # Ensure saved before M2M
from archivebox.core.models import Tag
tag_obj, _ = Tag.objects.get_or_create(name=tag)
snapshot.tags.add(tag_obj)
snapshot.save()
updated_count += 1
if not is_tty:
write_record(snapshot.to_json())
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Snapshots from stdin JSONL.

    Reads records from stdin, collects their `id` values and deletes the
    matching Snapshots. With dry_run=True the matches are only listed
    (no --yes needed). Otherwise the --yes flag is required to confirm.

    Exit codes:
        0: Success (including "nothing matched" and dry runs)
        1: No input, no usable IDs, or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Snapshot

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    snapshot_ids = [r.get('id') for r in records if r.get('id')]

    if not snapshot_ids:
        rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
        return 1

    snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
    count = snapshots.count()

    if count == 0:
        rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
        for snapshot in snapshots:
            rprint(f'  [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total_rows, {model_label: rows}); the total
    # also counts cascaded rows (related results, tag links, ...), so report
    # only the Snapshot entry to avoid over-stating how many were removed.
    _total_rows, rows_by_model = snapshots.delete()
    deleted_count = rows_by_model.get(Snapshot._meta.label, 0)
    rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Snapshot records."""
    # Container group only: the create/list/update/delete subcommands
    # registered on it do the actual work.
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
    """Create Snapshots from URLs or stdin JSONL."""
    # Delegate to create_snapshots() and surface its return value as the
    # process exit status.
    exit_code = create_snapshots(urls, tag=tag, status=status, depth=depth)
    sys.exit(exit_code)
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
             tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
    """List Snapshots as JSONL."""
    # Forward every CLI option unchanged; list_snapshots() returns the exit status.
    exit_code = list_snapshots(
        status=status,
        url__icontains=url__icontains,
        url__istartswith=url__istartswith,
        tag=tag,
        crawl_id=crawl_id,
        limit=limit,
    )
    sys.exit(exit_code)
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--tag', '-t', help='Add tag')
def update_cmd(status: Optional[str], tag: Optional[str]):
    """Update Snapshots from stdin JSONL."""
    # update_snapshots() reads the records from stdin itself; only the
    # requested changes are passed in here.
    exit_code = update_snapshots(status=status, tag=tag)
    sys.exit(exit_code)
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Snapshots from stdin JSONL."""
    # delete_snapshots() enforces the --yes / --dry-run rules and returns
    # the exit status.
    exit_code = delete_snapshots(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
if __name__ == '__main__':

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
archivebox tag <action> [args...] [--filters]
Manage Tag records.
Actions:
create - Create Tags
list - List Tags as JSONL (with optional filters)
update - Update Tags from stdin JSONL
delete - Delete Tags from stdin JSONL
Examples:
# Create
archivebox tag create news tech science
archivebox tag create "important stuff"
# List
archivebox tag list
archivebox tag list --name__icontains=news
# Update (rename tags)
archivebox tag list --name=oldname | archivebox tag update --name=newname
# Delete
archivebox tag list --name=unused | archivebox tag delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox tag'
import sys
from typing import Optional, Iterable
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """Apply Django-style filters from CLI kwargs to a QuerySet.

    Entries whose value is None, and the pagination keys 'limit'/'offset',
    are dropped; the remaining items are passed to queryset.filter(). A
    truthy `limit` then slices the result to its first `limit` rows.
    Returns the (possibly unchanged) queryset.
    """
    pagination_keys = ('limit', 'offset')
    active = {
        lookup: wanted
        for lookup, wanted in filter_kwargs.items()
        if wanted is not None and lookup not in pagination_keys
    }

    # Skip the .filter() call entirely when nothing was requested.
    narrowed = queryset.filter(**active) if active else queryset

    return narrowed[:limit] if limit else narrowed
# =============================================================================
# CREATE
# =============================================================================
def create_tags(names: Iterable[str]) -> int:
    """
    Get-or-create one Tag per given name.

    Names are stripped of surrounding whitespace and blank names are
    skipped. When stdout is not a TTY every resulting tag (new or already
    existing) is written as a JSONL record; progress goes to stderr.

    Exit codes:
        0: Success
        1: Failure (no names given)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Tag

    piped = not sys.stdout.isatty()

    # Materialise the input so it can be checked for emptiness
    requested = list(names) if names else []

    if not requested:
        rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
        return 1

    new_total = 0
    for raw_name in requested:
        cleaned = raw_name.strip()
        if not cleaned:
            continue

        tag, was_created = Tag.objects.get_or_create(name=cleaned)

        if piped:
            write_record(tag.to_json())

        if not was_created:
            rprint(f'[dim]Tag already exists: {cleaned}[/dim]', file=sys.stderr)
            continue

        new_total += 1
        rprint(f'[green]Created tag: {cleaned}[/green]', file=sys.stderr)

    rprint(f'[green]Created {new_total} new tags[/green]', file=sys.stderr)
    return 0
# =============================================================================
# LIST
# =============================================================================
def list_tags(
    name: Optional[str] = None,
    name__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Tags as JSONL with optional filters.

    Tags are ordered by name. `name` is an exact match, `name__icontains`
    a case-insensitive substring match, and `limit` caps the number of
    rows. On a TTY each tag is printed with its snapshot count; otherwise
    each tag is written as a JSONL record. A summary goes to stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Tag

    is_tty = sys.stdout.isatty()

    queryset = Tag.objects.all().order_by('name')

    # Apply filters
    filter_kwargs = {
        'name': name,
        'name__icontains': name__icontains,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for tag in queryset:
        if is_tty:
            # The per-tag COUNT query is only needed for the human-readable
            # line, so it is not issued at all when output is piped.
            snapshot_count = tag.snapshot_set.count()
            rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
        else:
            write_record(tag.to_json())
        count += 1

    rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_tags(name: Optional[str] = None) -> int:
    """
    Update Tags from stdin JSONL.

    Each stdin record identifies a Tag by `id` (preferred) or, failing
    that, by `name`. When `name` is given, the matched tag is renamed to
    it. Every matched tag is saved, counted, and (when stdout is not a
    TTY) echoed back as a JSONL record. Unknown tags are reported on
    stderr and skipped.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import Tag

    piped = not sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        tag_id = record.get('id')
        old_name = record.get('name')

        # Nothing to look the tag up by
        if not (tag_id or old_name):
            continue

        # id wins over name when both are present
        lookup = {'id': tag_id} if tag_id else {'name': old_name}

        try:
            tag = Tag.objects.get(**lookup)

            # Apply updates from CLI flags
            if name:
                tag.name = name

            tag.save()
            updated_count += 1

            if piped:
                write_record(tag.to_json())

        except Tag.DoesNotExist:
            rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
            continue

    rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Tags from stdin JSONL.

    Each stdin record selects a Tag by `id` or, when it has no id, by
    `name`. With dry_run=True the matching tags are only listed (no --yes
    needed). Otherwise the --yes flag is required to confirm deletion.

    Exit codes:
        0: Success (including "nothing matched" and dry runs)
        1: No input, no usable IDs/names, or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Tag

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Collect tag IDs or names (id takes precedence within a record)
    tag_ids = []
    tag_names = []
    for r in records:
        if r.get('id'):
            tag_ids.append(r['id'])
        elif r.get('name'):
            tag_names.append(r['name'])

    if not tag_ids and not tag_names:
        rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
        return 1

    from django.db.models import Q
    query = Q()
    if tag_ids:
        query |= Q(id__in=tag_ids)
    if tag_names:
        query |= Q(name__in=tag_names)

    tags = Tag.objects.filter(query)
    count = tags.count()

    if count == 0:
        rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
        for tag in tags:
            rprint(f'  {tag.name}', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total_rows, {model_label: rows}); the total
    # also counts cascaded rows such as snapshot<->tag links, so report only
    # the Tag entry to avoid over-stating how many tags were removed.
    _total_rows, rows_by_model = tags.delete()
    deleted_count = rows_by_model.get(Tag._meta.label, 0)
    rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Tag records."""
    # Container group only: the create/list/update/delete subcommands
    # registered on it do the actual work.
@main.command('create')
@click.argument('names', nargs=-1)
def create_cmd(names: tuple):
    """Create Tags from names."""
    # create_tags() returns the process exit status.
    exit_code = create_tags(names)
    sys.exit(exit_code)
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
    """List Tags as JSONL."""
    # Forward the CLI filters unchanged; list_tags() returns the exit status.
    exit_code = list_tags(
        name=name,
        name__icontains=name__icontains,
        limit=limit,
    )
    sys.exit(exit_code)
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
    """Update Tags from stdin JSONL."""
    # update_tags() reads the records from stdin itself; only the new name
    # is passed in here.
    exit_code = update_tags(name=name)
    sys.exit(exit_code)
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Tags from stdin JSONL."""
    # delete_tags() enforces the --yes / --dry-run rules and returns the
    # exit status.
    exit_code = delete_tags(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -1,17 +1,18 @@
#!/usr/bin/env python3
"""
Tests for CLI piping workflow: crawl | snapshot | extract
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
This module tests the JSONL-based piping between CLI commands as described in:
https://github.com/ArchiveBox/ArchiveBox/issues/1363
Workflows tested:
archivebox crawl URL -> Crawl JSONL
archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input)
archivebox extract -> ArchiveResult JSONL (accepts Snapshot input)
archivebox crawl create URL -> Crawl JSONL
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
archivebox run -> Process queued records (accepts any JSONL)
Pipeline:
archivebox crawl URL | archivebox snapshot | archivebox extract
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
Each command should:
- Accept URLs, IDs, or JSONL as input (args or stdin)
@@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase):
class TestJSONLOutput(unittest.TestCase):
"""Test JSONL output formatting."""
def test_crawl_to_jsonl(self):
"""Crawl model should serialize to JSONL correctly."""
def test_crawl_to_json(self):
"""Crawl model should serialize to JSON correctly."""
from archivebox.misc.jsonl import TYPE_CRAWL
# Create a mock crawl with to_jsonl method configured
# Create a mock crawl with to_json method configured
mock_crawl = MagicMock()
mock_crawl.to_jsonl.return_value = {
mock_crawl.to_json.return_value = {
'type': TYPE_CRAWL,
'schema_version': '0.9.0',
'id': 'test-crawl-uuid',
@@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase):
'created_at': None,
}
result = mock_crawl.to_jsonl()
result = mock_crawl.to_json()
self.assertEqual(result['type'], TYPE_CRAWL)
self.assertEqual(result['id'], 'test-crawl-uuid')
self.assertEqual(result['urls'], 'https://example.com')
@@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase):
# using real Snapshot instances.
class TestExtractCommand(unittest.TestCase):
"""Unit tests for archivebox extract command."""
class TestArchiveResultCommand(unittest.TestCase):
"""Unit tests for archivebox archiveresult command."""
def setUp(self):
"""Set up test environment."""
@@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_extract_accepts_snapshot_id(self):
"""extract should accept snapshot IDs as input."""
def test_archiveresult_accepts_snapshot_id(self):
"""archiveresult should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
@@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase):
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_extract_accepts_jsonl_snapshot(self):
"""extract should accept JSONL Snapshot records."""
def test_archiveresult_accepts_jsonl_snapshot(self):
"""archiveresult should accept JSONL Snapshot records."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
@@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase):
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], 'abc123')
def test_extract_gathers_snapshot_ids(self):
"""extract should gather snapshot IDs from various input formats."""
def test_archiveresult_gathers_snapshot_ids(self):
"""archiveresult should gather snapshot IDs from various input formats."""
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
records = [
@@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create crawl with multiple URLs (as newline-separated string)
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
self.assertIsNotNone(crawl)
self.assertIsNotNone(crawl.id)
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
self.assertIn('https://test-crawl-2.example.com', urls_list)
# Verify output format
output = crawl.to_jsonl()
output = crawl.to_json()
self.assertEqual(output['type'], TYPE_CRAWL)
self.assertIn('id', output)
self.assertEqual(output['urls'], urls)
@@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create crawl (simulating 'archivebox crawl')
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl_output = crawl.to_jsonl()
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl_output = crawl.to_json()
# Step 2: Parse crawl output as snapshot input
stdin = StringIO(json.dumps(crawl_output) + '\n')
@@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 3: Create snapshots from crawl URLs
created_snapshots = []
for url in crawl.get_urls_list():
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
@@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Verify snapshot output
for snapshot in created_snapshots:
output = snapshot.to_jsonl()
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn(output['url'], [
'https://crawl-to-snap-1.example.com',
@@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create snapshot
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
snapshot = Snapshot.from_json(records[0], overrides=overrides)
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
# Verify output format
output = snapshot.to_jsonl()
output = snapshot.to_json()
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn('id', output)
self.assertEqual(output['url'], url)
@@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
snapshot_output = snapshot.to_jsonl()
snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
snapshot_output = snapshot.to_json()
# Step 2: Parse snapshot output as extract input
stdin = StringIO(json.dumps(snapshot_output) + '\n')
@@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# === archivebox crawl https://example.com ===
url = 'https://test-pipeline-full.example.com'
crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
crawl_jsonl = json.dumps(crawl.to_jsonl())
crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
crawl_jsonl = json.dumps(crawl.to_json())
# === | archivebox snapshot ===
stdin = StringIO(crawl_jsonl + '\n')
@@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
if crawl_id:
db_crawl = Crawl.objects.get(id=crawl_id)
for crawl_url in db_crawl.get_urls_list():
snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id})
if snapshot:
created_snapshots.append(snapshot)
@@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
self.assertEqual(created_snapshots[0].url, url)
# === | archivebox extract ===
snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots]
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
stdin.isatty = lambda: False
@@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase):
# Create crawl with depth 0
url = 'https://depth0-test.example.com'
crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
self.assertEqual(crawl.max_depth, 0)
# Create snapshot
snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
self.assertEqual(snapshot.url, url)
def test_depth_metadata_in_crawl(self):
@@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
created_by_id = get_or_create_system_user_pk()
# Create crawl with depth
crawl = Crawl.from_jsonl(
crawl = Crawl.from_json(
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
overrides={'created_by_id': created_by_id}
)
@@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase):
self.assertEqual(crawl.max_depth, 2)
# Verify in JSONL output
output = crawl.to_jsonl()
output = crawl.to_json()
self.assertEqual(output['max_depth'], 2)

View File

@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary
class Tag(ModelWithSerializers):
JSONL_TYPE = 'Tag'
# Keep AutoField for compatibility with main branch migrations
# Don't use UUIDField here - requires complex FK transformation
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -91,26 +93,66 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Tag',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'name': self.name,
'slug': self.slug,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
    """
    Emit this Tag as a single JSON record (Tag is a leaf, it has no children).

    Args:
        seen: Optional set of (type, id) tuples that were already emitted.
              When given, the tag is skipped if its key is present, otherwise
              its key is added. When omitted, no deduplication happens.
        **kwargs: Accepted for signature compatibility with other models; ignored.

    Yields:
        dict: The result of self.to_json(), at most once.
    """
    # No tracking set supplied: always emit.
    if seen is None:
        yield self.to_json()
        return

    marker = (self.JSONL_TYPE, str(self.id))
    if marker not in seen:
        seen.add(marker)
        yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
    """
    Build Tags from a stream of JSONL record dicts.

    A record is handled when its 'type' equals cls.JSONL_TYPE; records that
    carry no 'type' key at all are treated as this type as well. Every other
    record is ignored.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Forwarded unchanged to from_json() for each matching record

    Returns:
        List of the truthy results returned by from_json(), in input order
    """
    own_type = cls.JSONL_TYPE
    matching = (rec for rec in records if rec.get('type', own_type) == own_type)
    built = (cls.from_json(rec, overrides=overrides) for rec in matching)
    return [tag for tag in built if tag]
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
"""
Create/update a single Tag from a JSON record dict.
Args:
record: Dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Snapshot'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Each line is a JSON record with a 'type' field:
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
- Binary: binary info used for the extraction
- Process: process execution details (cmd, exit_code, timing, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
"""
import json
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
index_path.parent.mkdir(parents=True, exist_ok=True)
# Track unique binaries and processes to avoid duplicates
binaries_seen = set()
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
for record in self.to_jsonl():
f.write(json.dumps(record) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1411,23 +1435,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not self.OUTPUT_DIR.exists():
return False
for plugin_dir in self.OUTPUT_DIR.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
# Check all .pid files in the snapshot directory (hook-specific names)
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
if process_is_alive(pid_file):
return True
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
return {
'type': 'Snapshot',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'crawl_id': str(self.crawl_id),
@@ -1442,12 +1464,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'fs_version': self.fs_version,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
    """
    Emit this Snapshot, followed (optionally) by the records of its ArchiveResults.

    Args:
        seen: Set of (type, id) tuples already emitted; a fresh set is created
              when omitted. The same set is handed to every child so that
              nothing is emitted twice.
        archiveresult: Also emit related ArchiveResults (default: True)
        process: Forwarded to each ArchiveResult.to_jsonl() (default: True)
        binary: Forwarded to each ArchiveResult.to_jsonl() (default: True)
        machine: Forwarded to each ArchiveResult.to_jsonl() (default: False)
        iface: Forwarded to each ArchiveResult.to_jsonl() (default: False)
        **kwargs: Additional options forwarded to children

    Yields:
        dict: JSON-serializable records, this snapshot first
    """
    emitted = set() if seen is None else seen

    ident = (self.JSONL_TYPE, str(self.id))
    if ident in emitted:
        return
    emitted.add(ident)
    yield self.to_json()

    if not archiveresult:
        return

    # select_related pulls process + binary in the same query as the results
    results = self.archiveresult_set.select_related('process__binary').order_by('start_ts')
    for result in results:
        yield from result.to_jsonl(
            seen=emitted,
            process=process,
            binary=binary,
            machine=machine,
            iface=iface,
            **kwargs,
        )
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
    """
    Build Snapshots from a stream of JSONL record dicts.

    Only records whose 'type' equals cls.JSONL_TYPE are handled; records
    without a 'type' key are treated as Snapshot records too.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Forwarded unchanged to from_json() for each matching record
        queue_for_extraction: Forwarded unchanged to from_json() (default: True)

    Returns:
        List of the truthy results returned by from_json(), in input order
    """
    snapshots = []
    for rec in records:
        # Skip records that explicitly belong to another model
        if rec.get('type', cls.JSONL_TYPE) != cls.JSONL_TYPE:
            continue
        snap = cls.from_json(rec, overrides=overrides, queue_for_extraction=queue_for_extraction)
        if not snap:
            continue
        snapshots.append(snap)
    return snapshots
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
"""
Create/update a single Snapshot from a JSON record dict.
Handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
@@ -2054,8 +2132,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
"""Convert to JSON string for file output."""
return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2203,6 +2281,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'ArchiveResult'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2274,13 +2354,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'ArchiveResult',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
@@ -2308,6 +2388,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
    """
    Emit this ArchiveResult, followed (optionally) by the records of its Process.

    Args:
        seen: Set of (type, id) tuples already emitted; a fresh set is created
              when omitted and shared with the Process
        process: Also emit the related Process and whatever it emits (default: True)
        **kwargs: Forwarded to self.process.to_jsonl()

    Yields:
        dict: JSON-serializable records, this archive result first
    """
    emitted = set() if seen is None else seen

    ident = (self.JSONL_TYPE, str(self.id))
    if ident in emitted:
        return
    emitted.add(ident)
    yield self.to_json()

    # self.process is only consulted when the caller asked for it
    if not process:
        return
    if self.process:
        yield from self.process.to_jsonl(seen=emitted, **kwargs)
def save(self, *args, **kwargs):
is_new = self._state.adding
@@ -2595,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Read and parse JSONL output from stdout.log
stdout_file = plugin_dir / 'stdout.log'
# Derive hook basename for hook-specific filenames
# e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
# Read and parse JSONL output from hook-specific stdout log
stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
stdout = stdout_file.read_text() if stdout_file.exists() else ''
records = []
@@ -2637,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.output_str = 'Hook did not output ArchiveResult record'
# Walk filesystem and populate output_files, output_size, output_mimetypes
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
# Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
def is_hook_output_file(name: str) -> bool:
    """
    Tell whether a filename is one of the hook's own bookkeeping files.

    Matches any '*.stdout.log', '*.stderr.log' or '*.pid' file, plus
    '*.sh' files whose name starts with 'on_'.
    """
    if name.endswith(('.stdout.log', '.stderr.log', '.pid')):
        return True
    # .sh files only count when they carry the hook naming prefix
    return name.startswith('on_') and name.endswith('.sh')
mime_sizes = defaultdict(int)
total_size = 0
output_files = {}
@@ -2645,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
for file_path in plugin_dir.rglob('*'):
if not file_path.is_file():
continue
if file_path.name in exclude_names:
if is_hook_output_file(file_path.name):
continue
try:
@@ -2703,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
}
process_hook_records(filtered_records, overrides=overrides)
# Cleanup PID files and empty logs
pid_file = plugin_dir / 'hook.pid'
# Cleanup PID files and empty logs (hook-specific names)
pid_file = plugin_dir / f'{hook_basename}.pid'
pid_file.unlink(missing_ok=True)
stderr_file = plugin_dir / 'stderr.log'
stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
if stdout_file.exists() and stdout_file.stat().st_size == 0:
stdout_file.unlink()
if stderr_file.exists() and stderr_file.stat().st_size == 0:
@@ -2812,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.pwd) if self.pwd else None
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
# Use hook-specific pid filename
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
pid_file = plugin_dir / f'{hook_basename}.pid'
return pid_file.exists()

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING, Iterable
from typing import TYPE_CHECKING, Iterable, Iterator, Set
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -59,6 +59,8 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Crawl'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -134,13 +136,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Crawl model instance to a JSONL record.
Convert Crawl model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Crawl',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'urls': self.urls,
@@ -151,10 +153,63 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'created_at': self.created_at.isoformat() if self.created_at else None,
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
    """
    Emit this Crawl, followed (optionally) by the records of its Snapshots.

    Args:
        seen: Set of (type, id) tuples already emitted; a fresh set is created
              when omitted and shared with every child
        snapshot: Also emit related Snapshots (default: True)
        archiveresult: Forwarded to each Snapshot.to_jsonl() (default: True)
        process: Forwarded to each Snapshot.to_jsonl() (default: True)
        binary: Forwarded to each Snapshot.to_jsonl() (default: True)
        machine: Forwarded to each Snapshot.to_jsonl() (default: False)
        iface: Forwarded to each Snapshot.to_jsonl() (default: False)
        **kwargs: Additional options forwarded to children

    Yields:
        dict: JSON-serializable records, this crawl first
    """
    emitted = set() if seen is None else seen

    ident = (self.JSONL_TYPE, str(self.id))
    if ident in emitted:
        return
    emitted.add(ident)
    yield self.to_json()

    if not snapshot:
        return

    for child in self.snapshot_set.all():
        yield from child.to_jsonl(
            seen=emitted,
            archiveresult=archiveresult,
            process=process,
            binary=binary,
            machine=machine,
            iface=iface,
            **kwargs,
        )
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']:
    """
    Build Crawls from a stream of JSONL record dicts.

    Only records whose 'type' equals cls.JSONL_TYPE are handled; records
    without a 'type' key are treated as Crawl records too.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Forwarded unchanged to from_json() (e.g., created_by_id)

    Returns:
        List of the truthy results returned by from_json(), in input order
    """
    own_type = cls.JSONL_TYPE
    crawls = []
    for rec in records:
        if rec.get('type', own_type) == own_type:
            crawl = cls.from_json(rec, overrides=overrides)
            if crawl:
                crawls += [crawl]
    return crawls
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Crawl | None':
"""
Create or get a single Crawl from a JSON record dict.
Args:
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'

View File

@@ -365,11 +365,14 @@ def run_hook(
# Old convention: __background in stem (for backwards compatibility)
is_background = '.bg.' in script.name or '__background' in script.stem
# Set up output files for ALL hooks (useful for debugging)
stdout_file = output_dir / 'stdout.log'
stderr_file = output_dir / 'stderr.log'
pid_file = output_dir / 'hook.pid'
cmd_file = output_dir / 'cmd.sh'
# Set up output files for ALL hooks - use hook-specific names to avoid conflicts
# when multiple hooks run in the same plugin directory
# e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
stdout_file = output_dir / f'{hook_basename}.stdout.log'
stderr_file = output_dir / f'{hook_basename}.stderr.log'
pid_file = output_dir / f'{hook_basename}.pid'
cmd_file = output_dir / f'{hook_basename}.sh'
try:
# Write command script for validation
@@ -421,8 +424,14 @@ def run_hook(
# Detect new files created by the hook
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
# Exclude the log files themselves from new_files
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
# Exclude the log/pid/sh files themselves from new_files (hook-specific names)
hook_output_files = {
f'{hook_basename}.stdout.log',
f'{hook_basename}.stderr.log',
f'{hook_basename}.pid',
f'{hook_basename}.sh',
}
new_files = [f for f in new_files if f not in hook_output_files]
# Parse JSONL output from stdout
# Each line starting with { that has 'type' field is a record
@@ -1176,7 +1185,9 @@ def create_model_record(record: Dict[str, Any]) -> Any:
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
"""
Process JSONL records from hook output.
Dispatches to Model.from_jsonl() for each record type.
Uses Model.from_jsonl() which automatically filters by JSONL_TYPE.
Each model only processes records matching its type.
Args:
records: List of JSONL record dicts from result['records']
@@ -1185,54 +1196,26 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
Returns:
Dict with counts by record type
"""
stats = {}
from archivebox.core.models import Snapshot, Tag
from archivebox.machine.models import Binary, Machine
overrides = overrides or {}
for record in records:
record_type = record.get('type')
if not record_type:
continue
# Filter out ArchiveResult records (they update the calling AR, not create new ones)
filtered_records = [r for r in records if r.get('type') != 'ArchiveResult']
# Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
if record_type == 'ArchiveResult':
continue
# Each model's from_jsonl() filters to only its own type
snapshots = Snapshot.from_jsonl(filtered_records, overrides)
tags = Tag.from_jsonl(filtered_records, overrides)
binaries = Binary.from_jsonl(filtered_records, overrides)
machines = Machine.from_jsonl(filtered_records, overrides)
try:
# Dispatch to appropriate model's from_jsonl() method
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
obj = Snapshot.from_jsonl(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from archivebox.core.models import Tag
obj = Tag.from_jsonl(record.copy(), overrides)
if obj:
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from archivebox.machine.models import Binary
obj = Binary.from_jsonl(record.copy(), overrides)
if obj:
stats['Binary'] = stats.get('Binary', 0) + 1
elif record_type == 'Machine':
from archivebox.machine.models import Machine
obj = Machine.from_jsonl(record.copy(), overrides)
if obj:
stats['Machine'] = stats.get('Machine', 0) + 1
else:
import sys
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
except Exception as e:
import sys
print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
continue
return stats
return {
'Snapshot': len(snapshots),
'Tag': len(tags),
'Binary': len(binaries),
'Machine': len(machines),
}
def process_is_alive(pid_file: Path) -> bool:
@@ -1261,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru
Kill process in PID file with optional validation.
Args:
pid_file: Path to hook.pid file
pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
sig: Signal to send (default SIGTERM)
validate: If True, validate process identity before killing (default: True)
"""
from archivebox.misc.process_utils import safe_kill_process
if validate:
# Use safe kill with validation
cmd_file = pid_file.parent / 'cmd.sh'
# Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
cmd_file = pid_file.with_suffix('.sh')
safe_kill_process(pid_file, cmd_file, signal_num=sig)
else:
# Legacy behavior - kill without validation

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.machine'
import socket
from typing import Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import timedelta
@@ -29,6 +30,8 @@ class MachineManager(models.Manager):
class Machine(ModelWithHealthStats):
JSONL_TYPE = 'Machine'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -69,13 +72,35 @@ class Machine(ModelWithHealthStats):
)
return _CURRENT_MACHINE
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']:
    """
    Apply Machine config updates from a stream of JSONL record dicts.

    Only records whose 'type' equals cls.JSONL_TYPE are handled; records
    without a 'type' key are treated as Machine records too.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Forwarded unchanged to from_json() for each matching record

    Returns:
        List of the truthy results returned by from_json(), in input order
    """
    own_type = cls.JSONL_TYPE
    matching = (rec for rec in records if rec.get('type', own_type) == own_type)
    updated = (cls.from_json(rec, overrides=overrides) for rec in matching)
    return [machine for machine in updated if machine]
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Machine | None':
"""
Update a single Machine config from a JSON record dict.
Args:
record: Dict with '_method': 'update', 'key': '...', 'value': '...'
overrides: Not used
Returns:
@@ -94,6 +119,44 @@ class Machine(ModelWithHealthStats):
return machine
return None
def to_json(self) -> dict:
    """
    Serialize this Machine into a plain JSON-compatible dict.

    The record carries the type marker, the ArchiveBox VERSION as
    schema_version, the stringified id, the host description fields and an
    ISO-formatted created_at (None when created_at is unset).
    """
    from archivebox.config import VERSION

    record = {
        'type': self.JSONL_TYPE,
        'schema_version': VERSION,
        'id': str(self.id),
    }
    # Plain attributes are copied through verbatim, in output order
    for field in (
        'guid',
        'hostname',
        'hw_in_docker',
        'hw_in_vm',
        'os_arch',
        'os_family',
        'os_platform',
        'os_release',
    ):
        record[field] = getattr(self, field)

    if self.created_at:
        record['created_at'] = self.created_at.isoformat()
    else:
        record['created_at'] = None
    return record
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
    """
    Emit this Machine as a single JSON record (Machine is a leaf, it has no children).

    Args:
        seen: Optional set of (type, id) tuples that were already emitted.
              When given, the machine is skipped if its key is present,
              otherwise its key is added. When omitted, no deduplication happens.
        **kwargs: Accepted for signature compatibility with other models; ignored.

    Yields:
        dict: The result of self.to_json(), at most once.
    """
    # No tracking set supplied: always emit.
    if seen is None:
        yield self.to_json()
        return

    marker = (self.JSONL_TYPE, str(self.id))
    if marker not in seen:
        seen.add(marker)
        yield self.to_json()
class NetworkInterfaceManager(models.Manager):
def current(self) -> 'NetworkInterface':
@@ -101,6 +164,8 @@ class NetworkInterfaceManager(models.Manager):
class NetworkInterface(ModelWithHealthStats):
JSONL_TYPE = 'NetworkInterface'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -139,6 +204,46 @@ class NetworkInterface(ModelWithHealthStats):
)
return _CURRENT_INTERFACE
def to_json(self) -> dict:
    """
    Serialize this NetworkInterface into a plain JSON-compatible dict.

    The record carries the type marker, the ArchiveBox VERSION as
    schema_version, the stringified id and machine_id, the network
    description fields and an ISO-formatted created_at (None when
    created_at is unset).
    """
    from archivebox.config import VERSION

    record = {
        'type': self.JSONL_TYPE,
        'schema_version': VERSION,
        'id': str(self.id),
        'machine_id': str(self.machine_id),
    }
    # Plain attributes are copied through verbatim, in output order
    for field in (
        'hostname',
        'iface',
        'ip_public',
        'ip_local',
        'mac_address',
        'dns_server',
        'isp',
        'city',
        'region',
        'country',
    ):
        record[field] = getattr(self, field)

    if self.created_at:
        record['created_at'] = self.created_at.isoformat()
    else:
        record['created_at'] = None
    return record
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
    """
    Emit this NetworkInterface as a single JSON record (leaf, it has no children).

    Args:
        seen: Optional set of (type, id) tuples that were already emitted.
              When given, the interface is skipped if its key is present,
              otherwise its key is added. When omitted, no deduplication happens.
        **kwargs: Accepted for signature compatibility with other models; ignored.

    Yields:
        dict: The result of self.to_json(), at most once.
    """
    # No tracking set supplied: always emit.
    if seen is None:
        yield self.to_json()
        return

    marker = (self.JSONL_TYPE, str(self.id))
    if marker not in seen:
        seen.add(marker)
        yield self.to_json()
class BinaryManager(models.Manager):
@@ -165,7 +270,7 @@ class BinaryManager(models.Manager):
class Binary(ModelWithHealthStats):
"""
Tracks an binary on a specific machine.
Tracks a binary on a specific machine.
Follows the unified state machine pattern:
- queued: Binary needs to be installed
@@ -176,6 +281,7 @@ class Binary(ModelWithHealthStats):
State machine calls run() which executes on_Binary__install_* hooks
to install the binary using the specified providers.
"""
JSONL_TYPE = 'Binary'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
@@ -242,13 +348,13 @@ class Binary(ModelWithHealthStats):
'is_valid': self.is_valid,
}
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Binary model instance to a JSONL record.
Convert Binary model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Binary',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'machine_id': str(self.machine_id),
@@ -260,17 +366,57 @@ class Binary(ModelWithHealthStats):
'status': self.status,
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
    """
    Emit this Binary as a single JSON record (Binary is a leaf, it has no children).

    Args:
        seen: Optional set of (type, id) tuples that were already emitted.
              When given, the binary is skipped if its key is present,
              otherwise its key is added. When omitted, no deduplication happens.
        **kwargs: Accepted for signature compatibility with other models; ignored.

    Yields:
        dict: The result of self.to_json(), at most once.
    """
    # No tracking set supplied: always emit.
    if seen is None:
        yield self.to_json()
        return

    marker = (self.JSONL_TYPE, str(self.id))
    if marker not in seen:
        seen.add(marker)
        yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']:
    """
    Build Binaries from a stream of JSONL record dicts.

    Only records whose 'type' equals cls.JSONL_TYPE are handled; records
    without a 'type' key are treated as Binary records too.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Forwarded unchanged to from_json() for each matching record

    Returns:
        List of the truthy results returned by from_json(), in input order
    """
    binaries = []
    for rec in records:
        # Skip records that explicitly belong to another model
        if rec.get('type', cls.JSONL_TYPE) != cls.JSONL_TYPE:
            continue
        found = cls.from_json(rec, overrides=overrides)
        if not found:
            continue
        binaries.append(found)
    return binaries
@staticmethod
def from_json(record: dict, overrides: dict = None) -> 'Binary | None':
"""
Create/update a single Binary from a JSON record dict.
Handles two cases:
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
2. From hook output: updates binary with abspath, version, sha256, binprovider
Args:
record: JSONL record with 'name' and either:
record: Dict with 'name' and either:
- 'binproviders', 'overrides' (from binaries.jsonl)
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
overrides: Not used
@@ -494,6 +640,7 @@ class Process(ModelWithHealthStats):
State machine calls launch() to spawn the process and monitors its lifecycle.
"""
JSONL_TYPE = 'Process'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
@@ -624,13 +771,13 @@ class Process(ModelWithHealthStats):
return self.archiveresult.hook_name
return ''
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Process model instance to a JSONL record.
Convert Process model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'Process',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'machine_id': str(self.machine_id),
@@ -650,6 +797,37 @@ class Process(ModelWithHealthStats):
record['timeout'] = self.timeout
return record
def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
    """
    Emit this Process followed by any requested related records.

    Args:
        seen: Set of (type, id) tuples already emitted; a fresh set is
              created when omitted. Shared with children for deduplication.
        binary: Also emit the related Binary (default: True)
        machine: Also emit the related Machine (default: False)
        iface: Also emit the related NetworkInterface (default: False)
        **kwargs: Forwarded to each child's to_jsonl()

    Yields:
        dict: JSON-serializable records, this Process first
    """
    seen = set() if seen is None else seen

    ident = (self.JSONL_TYPE, str(self.id))
    if ident in seen:
        return
    seen.add(ident)

    yield self.to_json()

    # Relations are emitted in a fixed order: binary, machine, iface.
    for wanted, attr_name in ((binary, 'binary'), (machine, 'machine'), (iface, 'iface')):
        if not wanted:
            continue
        related = getattr(self, attr_name)
        if related:
            yield from related.to_jsonl(seen=seen, **kwargs)
def update_and_requeue(self, **kwargs):
"""
Update process fields and requeue for worker state machine.

View File

@@ -24,7 +24,7 @@ __package__ = 'archivebox.misc'
import sys
import json
from typing import Iterator, Dict, Any, Optional, TextIO, Callable
from typing import Iterator, Dict, Any, Optional, TextIO
from pathlib import Path
@@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
count += 1
return count
def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]:
    """
    Lazily yield only the records whose 'type' field equals record_type.
    """
    yield from (entry for entry in records if entry.get('type') == record_type)
def process_records(
    records: Iterator[Dict[str, Any]],
    handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
) -> Iterator[Dict[str, Any]]:
    """
    Route each record to the handler registered for its 'type'.

    Args:
        records: Input record iterator
        handlers: Dict mapping type names to handler functions.
                  A handler returns an output record, or a falsy value to skip.

    Yields:
        The truthy results returned by handlers, in input order. Records with
        no registered handler are dropped.
    """
    for entry in records:
        callback = handlers.get(entry.get('type'))
        if not callback:
            continue
        output = callback(entry)
        if output:
            yield output

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for CAPTCHA solving in seconds"
}
}
}

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
*
* Priority: 21 (after chrome_launch at 20)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Get crawl's chrome directory from environment variable set by hooks.py
/**
 * Resolve the crawl-level chrome session directory.
 *
 * @returns {string|null} `<CRAWL_OUTPUT_DIR>/chrome`, or null when the
 *   CRAWL_OUTPUT_DIR environment variable is unset or empty.
 */
function getCrawlChromeSessionDir() {
    const crawlRoot = process.env.CRAWL_OUTPUT_DIR || '';
    return crawlRoot ? path.join(crawlRoot, 'chrome') : null;
}
// Directory holding the shared Chrome session files; falls back to the
// relative path '../chrome' when CRAWL_OUTPUT_DIR is not set.
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
// Marker file written after a successful configuration; its presence makes
// configure2Captcha() return early as "skipped".
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
// Get environment variable with default
/**
 * Read an environment variable, trimmed of surrounding whitespace.
 *
 * @param {string} name - Environment variable name.
 * @param {string} [defaultValue=''] - Used when the variable is unset or empty.
 * @returns {string} The trimmed value (the default is trimmed as well).
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const chosen = raw ? raw : defaultValue;
    return chosen.trim();
}
// Parse command line arguments
/**
 * Parse `--key=value` style command line arguments from process.argv.
 *
 * Dashes in keys become underscores (`--snapshot-id` -> `snapshot_id`).
 * A flag without a value, or with an empty value, is stored as `true`.
 * Arguments that do not start with `--` are ignored.
 *
 * @returns {Object} Map of normalized keys to string values or `true`.
 */
function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) continue;

        const body = token.slice(2);
        const eq = body.indexOf('=');
        // Everything after the first '=' belongs to the value, including later '=' signs.
        const rawKey = eq === -1 ? body : body.slice(0, eq);
        const rawValue = eq === -1 ? '' : body.slice(eq + 1);

        parsed[rawKey.replace(/-/g, '_')] = rawValue || true;
    }
    return parsed;
}
/**
 * Inject the 2captcha API key into the captcha2 extension of the running
 * crawl-level Chrome session.
 *
 * Steps:
 *   1. Return early (skipped) if the marker file already exists.
 *   2. Require API_KEY_2CAPTCHA to be set to a non-placeholder value.
 *   3. Look up the captcha2 entry in extensions.json.
 *   4. Connect over CDP and try to write the key via the extension's
 *      background page/worker, then fall back to its options page.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 *   Result descriptor; never rejects for CDP/page failures (those are
 *   reported via `error`). Reading or parsing extensions.json may still throw.
 */
async function configure2Captcha() {
    // Check if already configured in this session
    if (fs.existsSync(CONFIG_MARKER)) {
        console.error('[*] 2captcha already configured in this browser session');
        return { success: true, skipped: true };
    }

    // Check if API key is set
    const apiKey = getEnv('API_KEY_2CAPTCHA');
    if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
        console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
        console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
        return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
    }

    // Load extensions metadata
    const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
    if (!fs.existsSync(extensionsFile)) {
        return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
    }

    const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
    const captchaExt = extensions.find(ext => ext.name === 'captcha2');

    if (!captchaExt) {
        console.error('[*] 2captcha extension not installed, skipping configuration');
        return { success: true, skipped: true };
    }

    console.error('[*] Configuring 2captcha extension with API key...');

    try {
        // Connect to the existing Chrome session via CDP
        const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
        if (!fs.existsSync(cdpFile)) {
            return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
        }

        const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
        const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

        try {
            // Method 1: Try to inject via extension background page
            if (captchaExt.target && captchaExt.target_ctx) {
                console.error('[*] Attempting to configure via extension background page...');

                // Re-query the browser targets to get a fresh target context
                const targets = await browser.targets();
                const extTarget = targets.find(t =>
                    t.url().startsWith(`chrome-extension://${captchaExt.id}`)
                );

                if (extTarget) {
                    const extContext = await extTarget.worker() || await extTarget.page();

                    if (extContext) {
                        await extContext.evaluate((key) => {
                            // The storage key the extension reads is not known here,
                            // so write the API key under all common names.
                            if (typeof chrome !== 'undefined' && chrome.storage) {
                                chrome.storage.local.set({
                                    apiKey: key,
                                    api_key: key,
                                    '2captcha_apikey': key,
                                    apikey: key,
                                    'solver-api-key': key,
                                });
                                chrome.storage.sync.set({
                                    apiKey: key,
                                    api_key: key,
                                    '2captcha_apikey': key,
                                    apikey: key,
                                    'solver-api-key': key,
                                });
                            }

                            // Also try localStorage as fallback
                            if (typeof localStorage !== 'undefined') {
                                localStorage.setItem('apiKey', key);
                                localStorage.setItem('2captcha_apikey', key);
                                localStorage.setItem('solver-api-key', key);
                            }
                        }, apiKey);

                        console.error('[+] 2captcha API key configured successfully via background page');

                        // Mark as configured
                        fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

                        return { success: true, method: 'background_page' };
                    }
                }
            }

            // Method 2: Try to configure via options page
            console.error('[*] Attempting to configure via options page...');
            const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
            const configPage = await browser.newPage();

            try {
                await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });

                const configured = await configPage.evaluate((key) => {
                    // Candidate selectors for the API key input, most specific first
                    const selectors = [
                        'input[name*="apikey" i]',
                        'input[id*="apikey" i]',
                        'input[name*="api-key" i]',
                        'input[id*="api-key" i]',
                        'input[name*="key" i]',
                        'input[placeholder*="api" i]',
                        'input[type="text"]',
                    ];

                    // Locate a save/apply control. `:contains()` is a jQuery
                    // extension, not valid CSS: passing it to querySelector()
                    // throws a SyntaxError, so button labels are matched by
                    // text content instead.
                    const findSaveButton = () => {
                        for (const btnSel of ['button[type="submit"]', 'input[type="submit"]']) {
                            const btn = document.querySelector(btnSel);
                            if (btn) return btn;
                        }
                        const buttons = Array.from(document.querySelectorAll('button'));
                        for (const label of ['Save', 'Apply']) {
                            const btn = buttons.find(b => (b.textContent || '').includes(label));
                            if (btn) return btn;
                        }
                        return null;
                    };

                    for (const selector of selectors) {
                        const input = document.querySelector(selector);
                        if (input) {
                            input.value = key;
                            // Fire events so framework-bound forms notice the new value
                            input.dispatchEvent(new Event('input', { bubbles: true }));
                            input.dispatchEvent(new Event('change', { bubbles: true }));

                            // Try to find and click save button
                            const saveBtn = findSaveButton();
                            if (saveBtn) {
                                saveBtn.click();
                            }

                            // Also save to storage
                            if (typeof chrome !== 'undefined' && chrome.storage) {
                                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                            }

                            return true;
                        }
                    }

                    // Fallback: Just save to storage
                    if (typeof chrome !== 'undefined' && chrome.storage) {
                        chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                        chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                        return true;
                    }

                    return false;
                }, apiKey);

                await configPage.close();

                if (configured) {
                    console.error('[+] 2captcha API key configured successfully via options page');

                    // Mark as configured
                    fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

                    return { success: true, method: 'options_page' };
                }
            } catch (e) {
                console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
                // Best-effort close; the page may already be gone
                try {
                    await configPage.close();
                } catch (e2) {}
            }

            return { success: false, error: 'Could not configure via any method' };
        } finally {
            // Detach from the shared session without closing the browser
            browser.disconnect();
        }
    } catch (e) {
        return { success: false, error: `${e.name}: ${e.message}` };
    }
}
/**
 * CLI entry point: validate arguments, run configure2Captcha(), and exit.
 *
 * Exits 0 when configuration succeeded or was skipped, 1 otherwise
 * (including when --url or --snapshot-id is missing).
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    // NOTE(review): the usage text names an on_Snapshot hook while the file
    // header describes an on_Crawl hook - confirm which arguments the caller
    // actually passes.
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    let status = 'failed';
    let error = '';

    try {
        const result = await configure2Captcha();

        if (result.skipped) {
            status = 'skipped';
        } else if (result.success) {
            status = 'succeeded';
        } else {
            error = result.error || 'Configuration failed';
        }
    } catch (e) {
        error = `${e.name}: ${e.message}`;
    }

    if (error) {
        console.error(`ERROR: ${error}`);
    }

    // Config hooks don't emit JSONL - they're utility hooks for setup
    // Exit code indicates success/failure
    process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
// Run the hook; any rejection that escapes main() is logged and mapped to exit code 1.
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});

View File

@@ -1,184 +0,0 @@
"""
Unit tests for captcha2 plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
def test_install_script_exists():
    """Verify the install hook was discovered by the glob and exists on disk."""
    # next(..., None) yields None when the glob matches nothing; check that
    # first so the failure is a readable assertion, not an AttributeError.
    assert INSTALL_SCRIPT is not None, f"No install script matching on_Crawl__*_captcha2.* in {PLUGIN_DIR}"
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_config_script_exists():
    """Verify the config hook was discovered by the glob and exists on disk."""
    # next(..., None) yields None when the glob matches nothing; check that
    # first so the failure is a readable assertion, not an AttributeError.
    assert CONFIG_SCRIPT is not None, f"No config script matching on_Crawl__*_captcha2_config.* in {PLUGIN_DIR}"
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def test_extension_metadata():
    """Test that the install script exports the expected captcha2 EXTENSION metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # json.dumps() produces a properly quoted/escaped JS string literal, so
        # paths containing quotes or backslashes do not break the inline script.
        script_literal = json.dumps(str(INSTALL_SCRIPT))

        # Just check the script can be loaded
        result = subprocess.run(
            ["node", "-e", f"const ext = require({script_literal}); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
def test_install_creates_cache():
    """Running the install hook should leave a captcha2.extension.json cache file behind."""
    with tempfile.TemporaryDirectory() as workdir:
        extensions_root = Path(workdir) / "chrome_extensions"
        extensions_root.mkdir(parents=True)

        hook_env = {
            **os.environ,
            "CHROME_EXTENSIONS_DIR": str(extensions_root),
            "API_KEY_2CAPTCHA": "test_api_key",
        }

        # Run install script
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            env=hook_env,
            timeout=60,
            capture_output=True,
            text=True,
        )

        # The hook must announce either a fresh install or a reused one
        announced = (
            "[*] Installing 2captcha extension" in proc.stdout
            or "[*] 2captcha extension already installed" in proc.stdout
        )
        assert announced

        # The cache file must exist ...
        manifest_cache = extensions_root / "captcha2.extension.json"
        assert manifest_cache.exists(), "Cache file should be created"

        # ... and describe the captcha2 extension
        cached = json.loads(manifest_cache.read_text())
        assert cached["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cached["name"] == "captcha2"
        for required_key in ("unpacked_path", "version"):
            assert required_key in cached
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # Second run should mention cache reuse. The previous version of this
        # check also accepted `returncode == 0`, which was already asserted
        # above and therefore made the assertion impossible to fail.
        second_output = (result2.stdout + result2.stderr).lower()
        assert "already installed" in second_output or "cache" in second_output, (
            f"Second install did not report cache reuse: {result2.stdout}{result2.stderr}"
        )
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # os.environ.copy() inherits the caller's environment, so the key must
        # be removed explicitly or a developer/CI machine that exports
        # API_KEY_2CAPTCHA would silently run this test with a key set.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """With an API key in the environment, the install hook output should mention it."""
    with tempfile.TemporaryDirectory() as workdir:
        extensions_root = Path(workdir) / "chrome_extensions"
        extensions_root.mkdir(parents=True)

        hook_env = {
            **os.environ,
            "CHROME_EXTENSIONS_DIR": str(extensions_root),
            "API_KEY_2CAPTCHA": "test_valid_api_key_123",
        }

        # Run install script
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            env=hook_env,
            timeout=60,
            capture_output=True,
            text=True,
        )

        # Either stream may carry the message, so inspect both together
        everything = proc.stdout + proc.stderr
        mentions_key = any(
            needle in everything
            for needle in ("API key configured", "API_KEY_2CAPTCHA")
        )
        assert mentions_key
def test_config_script_structure():
    """Test that config script has proper structure"""
    # The glob lookup yields None when no config hook is present; fail with a
    # clear message instead of an AttributeError on None.read_text().
    assert CONFIG_SCRIPT is not None, f"No config script matching on_Crawl__*_captcha2_config.* in {PLUGIN_DIR}"

    # Verify the script contains expected markers
    script_content = CONFIG_SCRIPT.read_text()

    # Should mention configuration marker file
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content

    # Should mention API key
    assert "API_KEY_2CAPTCHA" in script_content

    # Should have main function or be executable
    assert "async function" in script_content or "main" in script_content

View File

@@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) {
}
// Step 8: Clean up PID files
// Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
if (outputDir) {
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
}
console.error('[*] Chrome cleanup completed');

View File

@@ -3,7 +3,12 @@
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to install/find Chromium and puppeteer-core.
Outputs JSONL for Binary and Machine config updates.
Also validates config and computes derived values.
Outputs:
- JSONL for Binary and Machine config updates
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
Respects CHROME_BINARY env var for custom binary paths.
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
@@ -19,6 +24,28 @@ import subprocess
from pathlib import Path
def get_env(name: str, default: str = '') -> str:
    """Return environment variable `name` (or `default` when unset), stripped of surrounding whitespace."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """
    Interpret environment variable `name` as a boolean.

    'true'/'1'/'yes'/'on' map to True and 'false'/'0'/'no'/'off' map to False
    (case-insensitive); any other value, including unset, returns `default`.
    """
    known_values = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return known_values.get(get_env(name, '').lower(), default)
def detect_docker() -> bool:
    """
    Report whether a container marker is present.

    True when /.dockerenv exists, when IN_DOCKER is 'true', '1' or 'yes'
    (case-insensitive), or when /run/.containerenv exists.
    """
    if os.path.exists('/.dockerenv'):
        return True
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    return os.path.exists('/run/.containerenv')
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
@@ -131,13 +158,41 @@ def install_chromium() -> dict | None:
def main():
warnings = []
errors = []
computed = {}
# Install puppeteer-core if NODE_MODULES_DIR is set
install_puppeteer_core()
# Check if Chrome is enabled
chrome_enabled = get_env_bool('CHROME_ENABLED', True)
# Detect Docker and adjust sandbox
in_docker = detect_docker()
computed['IN_DOCKER'] = str(in_docker).lower()
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
if in_docker and chrome_sandbox:
warnings.append(
"Running in Docker with CHROME_SANDBOX=true. "
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
)
# Auto-disable sandbox in Docker unless explicitly set
if not get_env('CHROME_SANDBOX'):
computed['CHROME_SANDBOX'] = 'false'
# Check Node.js availability
node_binary = get_env('NODE_BINARY', 'node')
computed['NODE_BINARY'] = node_binary
# Check if CHROME_BINARY is already set and valid
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
configured_binary = get_env('CHROME_BINARY', '')
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
version = get_chrome_version(configured_binary)
computed['CHROME_BINARY'] = configured_binary
computed['CHROME_VERSION'] = version or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': 'chromium',
@@ -145,12 +200,22 @@ def main():
'version': version,
'binprovider': 'env',
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
# Install/find Chromium via puppeteer
result = install_chromium()
if result and result.get('abspath'):
computed['CHROME_BINARY'] = result['abspath']
computed['CHROME_VERSION'] = result['version'] or 'unknown'
print(json.dumps({
'type': 'Binary',
'name': result['name'],
@@ -174,9 +239,25 @@ def main():
'value': result['version'],
}))
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
sys.exit(0)
else:
print("Chromium binary not found", file=sys.stderr)
errors.append("Chromium binary not found")
computed['CHROME_BINARY'] = ''
# Output computed values and errors
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
sys.exit(1)

View File

@@ -1,172 +0,0 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order
CHROME_BINARY_NAMES = [
'chromium',
'chromium-browser',
'google-chrome',
'google-chrome-stable',
'chrome',
]
def get_env(name: str, default: str = '') -> str:
    """Look up `name` in the environment, falling back to `default`, and strip whitespace from the result."""
    value = os.environ[name] if name in os.environ else default
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """
    Parse environment variable `name` as a boolean flag.

    Recognizes true/1/yes/on and false/0/no/off case-insensitively;
    returns `default` for anything else (including an unset variable).
    """
    truthy = {'true', '1', 'yes', 'on'}
    falsy = {'false', '0', 'no', 'off'}

    token = get_env(name, '').lower()
    if token in truthy:
        result = True
    elif token in falsy:
        result = False
    else:
        result = default
    return result
def detect_docker() -> bool:
    """
    Detect a container environment.

    Checks, in order: the /.dockerenv marker file, the IN_DOCKER environment
    variable ('true', '1' or 'yes', case-insensitive), and the
    /run/.containerenv marker file.
    """
    has_dockerenv = os.path.exists('/.dockerenv')
    if has_dockerenv:
        return True

    flag = os.environ.get('IN_DOCKER', '').lower()
    if flag in ('true', '1', 'yes'):
        return True

    has_containerenv = os.path.exists('/run/.containerenv')
    return has_containerenv
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """
    Locate a Chrome binary through abx-pkg.

    The configured name/path (when non-empty) is tried first, followed by each
    entry of CHROME_BINARY_NAMES in order. Any exception raised while loading
    a candidate is ignored and the next candidate is tried.

    Returns:
        The first loaded Binary with a truthy abspath, or None if none is found.
    """
    candidates = ([configured] if configured else []) + CHROME_BINARY_NAMES

    for candidate in candidates:
        try:
            found = Binary(name=candidate, binproviders=[provider]).load()
            if found.abspath:
                return found
        except Exception:
            # Best-effort lookup: a failing candidate just means "try the next one"
            continue

    return None
def output_binary(binary: Binary, name: str):
    """
    Print one Binary JSONL record for `binary` to stdout.

    Args:
        binary: Loaded binary providing abspath, version and sha256
        name: Value written to the record's 'name' field

    The record's machine_id comes from the MACHINE_ID environment variable
    (empty string when unset); binprovider is always 'env'.
    """
    version_text = str(binary.version) if binary.version else ''
    record = dict(
        type='Binary',
        name=name,
        abspath=str(binary.abspath),
        version=version_text,
        sha256=binary.sha256 or '',
        binprovider='env',
        machine_id=os.environ.get('MACHINE_ID', ''),
    )
    print(json.dumps(record))
def main():
    """
    Validate Chrome-related config and print derived values.

    Prints Binary JSONL records and COMPUTED:KEY=VALUE lines to stdout,
    WARNING:/ERROR: lines to stderr, then exits with status 1 if any error
    was recorded and 0 otherwise.
    """
    computed = {}
    warnings, errors = [], []

    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    sandbox_requested = get_env_bool('CHROME_SANDBOX', True)
    extractor_flags = [
        get_env_bool('SCREENSHOT_ENABLED', True),
        get_env_bool('PDF_ENABLED', True),
        get_env_bool('DOM_ENABLED', True),
    ]

    # USE_CHROME is derived: Chrome is needed if any of those extractors is enabled
    use_chrome = any(extractor_flags)
    computed['USE_CHROME'] = str(use_chrome).lower()

    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()

    if in_docker and sandbox_requested:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        if get_env('CHROME_SANDBOX') == '':
            computed['CHROME_SANDBOX'] = 'false'

    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if chrome and chrome.abspath:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
            # Output Binary JSONL record for Chrome
            output_binary(chrome, name='chrome')
        else:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''

    # Check Node.js for Puppeteer
    node_binary_name = get_env('NODE_BINARY', 'node')
    node, node_path = None, ''
    try:
        loaded = Binary(name=node_binary_name, binproviders=[provider]).load()
        loaded_path = str(loaded.abspath) if loaded.abspath else ''
    except Exception:
        # Any lookup failure is treated as "node not available"
        pass
    else:
        node, node_path = loaded, loaded_path

    node_missing = use_chrome and not node_path
    if node_missing:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output Binary JSONL record for Node
            output_binary(node, name='node')

    # Output computed values (dict order == insertion order above)
    for env_key, env_value in computed.items():
        print(f"COMPUTED:{env_key}={env_value}")

    for message in warnings:
        print(f"WARNING:{message}", file=sys.stderr)

    for message in errors:
        print(f"ERROR:{message}", file=sys.stderr)

    sys.exit(1 if errors else 0)
if __name__ == '__main__':
main()

View File

@@ -8,8 +8,8 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Creates chrome/ directory under crawl output dir with:
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
@@ -42,7 +42,7 @@ const {
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
const OUTPUT_DIR = '.';
// Global state for cleanup
let chromePid = null;
@@ -143,12 +143,11 @@ async function main() {
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
}
// Write hook's own PID
const hookStartTime = Date.now() / 1000;
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
// Launch Chromium using consolidated function
const result = await launchChromium({
@@ -165,14 +164,6 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
// Write extensions metadata
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
@@ -181,30 +172,102 @@ async function main() {
});
browserInstance = browser;
// Verify extensions loaded
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 3000));
await new Promise(r => setTimeout(r, 2000));
const targets = browser.targets();
console.error(`[*] All browser targets (${targets.length}):`);
for (const t of targets) {
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));
// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
let manifestName = manifest.name || '';
// Resolve message placeholder (e.g., __MSG_extName__)
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
const defaultLocale = manifest.default_locale || 'en';
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
if (fs.existsSync(messagesPath)) {
try {
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
if (messages[msgKey] && messages[msgKey].message) {
manifestName = messages[msgKey].message;
}
} catch (e) {
console.error(`[!] Failed to read messages.json: ${e.message}`);
}
}
}
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}
if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}
await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Filter out built-in extensions
// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = extTargets.filter(t => {
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
@@ -216,7 +279,7 @@ async function main() {
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
@@ -225,6 +288,14 @@ async function main() {
}
}
// Write extensions metadata with actual IDs
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
console.error(`[+] Chromium session started for crawl ${crawlId}`);
console.error(`[+] CDP URL: ${cdpUrl}`);
console.error(`[+] PID: ${chromePid}`);

View File

@@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
@@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
try {
// Wait for Chrome to be ready

View File

@@ -29,7 +29,7 @@ import shutil
import platform
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
@@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Get test environment with NODE_MODULES_DIR set
env = get_test_env()
@@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation():
# Launch Chrome at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -292,7 +293,7 @@ def test_chrome_navigation():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end():
# Launch Chrome in background
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
function parseArgs() {
@@ -221,8 +221,8 @@ async function main() {
// Set up listeners BEFORE navigation
await setupListeners();
// Write PID file so chrome_cleanup can kill any remaining processes
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -122,6 +122,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -129,7 +130,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -16,7 +16,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
def test_install_script_exists():
@@ -124,78 +124,106 @@ def test_no_configuration_required():
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
def setup_test_lib_dirs(tmpdir: Path) -> dict:
"""Create isolated lib directories for tests and return env dict.
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
Sets up:
LIB_DIR: tmpdir/lib/<arch>
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
.bin/
node_modules/
personas/
Default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
arch = platform.machine()
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
system = platform.system().lower()
arch_dir = f"{arch}-{system}"
if machine in ('arm64', 'aarch64'):
machine = 'arm64'
elif machine in ('x86_64', 'amd64'):
machine = 'x86_64'
machine_type = f"{machine}-{system}"
lib_dir = tmpdir / 'lib' / arch_dir
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
npm_bin_dir = npm_dir / 'bin'
pip_venv_dir = lib_dir / 'pip' / 'venv'
pip_bin_dir = pip_venv_dir / 'bin'
# Create directories
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
pip_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Install puppeteer-core to the test node_modules if not present
if not (node_modules_dir / 'puppeteer-core').exists():
result = subprocess.run(
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
capture_output=True,
text=True,
timeout=120
)
if result.returncode != 0:
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
return {
# Build complete env dict
env = os.environ.copy()
env.update({
'DATA_DIR': str(data_dir),
'LIB_DIR': str(lib_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'MACHINE_TYPE': machine_type,
'NPM_BIN_DIR': str(npm_bin_dir),
'PIP_VENV_DIR': str(pip_venv_dir),
'PIP_BIN_DIR': str(pip_bin_dir),
}
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
PLUGINS_ROOT = PLUGIN_DIR.parent
def find_chromium_binary():
"""Find the Chromium binary using chrome_utils.js findChromium().
This uses the centralized findChromium() function which checks:
- CHROME_BINARY env var
- @puppeteer/browsers install locations
- System Chromium locations
- Falls back to Chrome (with warning)
"""
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['node', str(chrome_utils), 'findChromium'],
capture_output=True,
text=True,
timeout=10
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
# Parse JSONL output to get CHROME_BINARY
chrome_binary = None
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
try:
data = json.loads(line)
if data.get('type') == 'Binary' and data.get('abspath'):
chrome_binary = data['abspath']
break
except json.JSONDecodeError:
continue
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
if not chrome_binary or not Path(chrome_binary).exists():
pytest.skip(f"Chromium binary not found: {chrome_binary}")
env['CHROME_BINARY'] = chrome_binary
return env
TEST_URL = 'https://www.filmin.es/'
@@ -210,22 +238,11 @@ def test_extension_loads_in_chromium():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env.setdefault('CHROME_HEADLESS', 'true')
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the extension
result = subprocess.run(
@@ -245,13 +262,16 @@ def test_extension_loads_in_chromium():
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
crawl_id = 'test-cookies'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -400,156 +420,362 @@ const puppeteer = require('puppeteer-core');
pass
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
chrome_dir.mkdir(parents=True, exist_ok=True)
Uses Chromium with extensions loaded automatically via chrome hook.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
if not cdp_url:
chrome_launch_process.kill()
raise RuntimeError("Chromium CDP URL not found after 20s")
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
return chrome_launch_process, cdp_url
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
"""Clean up Chromium process."""
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
# Step 3: Connect to Chromium and test cookie consent hiding
test_script = f'''
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check if cookie consent elements are visible on a page.
Returns dict with:
- visible: bool - whether any cookie consent element is visible
- selector: str - which selector matched (if visible)
- elements_found: list - all cookie-related elements found in DOM
- html_snippet: str - snippet of the page HTML for debugging
"""
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 2000));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
console.error('Navigating to {test_url}...');
await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
// Wait for extension content script to process page
await new Promise(r => setTimeout(r, 5000));
// Wait for page to fully render and any cookie scripts to run
await new Promise(r => setTimeout(r, 3000));
// Check cookie consent visibility
// Check cookie consent visibility using multiple common selectors
const result = await page.evaluate(() => {{
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
// Common cookie consent selectors used by various consent management platforms
const selectors = [
// CookieYes
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal',
// OneTrust
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
// Cookiebot
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
// Generic cookie banners
'[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
'[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]',
'[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
'[id*="cookieconsent"]', '[id*="cookie-law"]',
// GDPR banners
'[class*="gdpr"]', '[id*="gdpr"]',
// Consent banners
'[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]',
// Privacy banners
'[class*="privacy-banner"]', '[class*="privacy-notice"]',
// Common frameworks
'.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites
'.qc-cmp2-container', // Quantcast
'.sp-message-container', // SourcePoint
];
const elementsFound = [];
let visibleElement = null;
for (const sel of selectors) {{
const el = document.querySelector(sel);
if (el) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const visible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
rect.width > 0 && rect.height > 0;
if (visible) return {{ visible: true, selector: sel }};
try {{
const elements = document.querySelectorAll(sel);
for (const el of elements) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const isVisible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0' &&
rect.width > 0 && rect.height > 0;
elementsFound.push({{
selector: sel,
visible: isVisible,
display: style.display,
visibility: style.visibility,
opacity: style.opacity,
width: rect.width,
height: rect.height
}});
if (isVisible && !visibleElement) {{
visibleElement = {{ selector: sel, width: rect.width, height: rect.height }};
}}
}}
}} catch (e) {{
// Invalid selector, skip
}}
}}
return {{ visible: false }};
// Also grab a snippet of the HTML to help debug
const bodyHtml = document.body.innerHTML.slice(0, 2000);
const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') ||
bodyHtml.toLowerCase().includes('consent') ||
bodyHtml.toLowerCase().includes('gdpr');
return {{
visible: visibleElement !== null,
selector: visibleElement ? visibleElement.selector : null,
elements_found: elementsFound,
has_cookie_keyword_in_html: hasCookieKeyword,
html_snippet: bodyHtml.slice(0, 500)
}};
}});
console.error('Cookie consent:', JSON.stringify(result));
console.error('Cookie consent check result:', JSON.stringify({{
visible: result.visible,
selector: result.selector,
elements_found_count: result.elements_found.length
}}));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_extension.js'
script_path.write_text(test_script)
script_path = script_dir / 'check_cookies.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=90
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
timeout=90
)
if result.returncode != 0:
raise RuntimeError(f"Cookie check script failed: {result.stderr}")
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
if not output_lines:
raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}")
return json.loads(output_lines[-1])
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies cookie consent IS visible (baseline)
2. WITH extension - verifies cookie consent is HIDDEN
This ensures we're actually testing the extension's effect, not just
that a page happens to not have cookie consent.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_cookie_consent_visibility(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
print(f"Baseline result: visible={baseline_result['visible']}, "
f"elements_found={len(baseline_result['elements_found'])}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
assert not test_result['visible'], \
f"Cookie consent should be hidden by extension. Result: {test_result}"
if baseline_result['elements_found']:
print("Elements found in baseline:")
for el in baseline_result['elements_found'][:5]: # Show first 5
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
# Clean up Chromium
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows cookie consent
if not baseline_result['visible']:
# If no cookie consent visible in baseline, we can't test the extension
# This could happen if:
# - The site changed and no longer shows cookie consent
# - Cookie consent is region-specific
# - Our selectors don't match this site
print("\nWARNING: No cookie consent visible in baseline!")
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
pytest.skip(
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
f"Elements found: {len(baseline_result['elements_found'])}. "
f"The site may have changed or cookie consent may be region-specific."
)
print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})")
# ============================================================
# STEP 2: Install the extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
env_with_ext = env_base.copy()
env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env_with_ext,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# ============================================================
# STEP 3: Run WITH extension, verify cookie consent is HIDDEN
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
ext_process, ext_cdp_url = launch_chromium_session(
env_with_ext, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Wait for extension to initialize
time.sleep(3)
ext_result = check_cookie_consent_visibility(
ext_cdp_url, TEST_URL, env_with_ext, tmpdir
)
print(f"Extension result: visible={ext_result['visible']}, "
f"elements_found={len(ext_result['elements_found'])}")
if ext_result['elements_found']:
print("Elements found with extension:")
for el in ext_result['elements_found'][:5]:
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}")
print(f"With extension: cookie consent visible = {ext_result['visible']}")
assert baseline_result['visible'], \
"Baseline should show cookie consent (this shouldn't happen, we checked above)"
assert not ext_result['visible'], \
f"Cookie consent should be HIDDEN by extension.\n" \
f"Baseline showed consent at: {baseline_result['selector']}\n" \
f"But with extension, consent is still visible.\n" \
f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}"
print("\n✓ SUCCESS: Extension correctly hides cookie consent!")
print(f" - Baseline showed consent at: {baseline_result['selector']}")
print(f" - Extension successfully hid it")

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -123,6 +123,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -130,7 +131,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Global state
@@ -274,8 +274,8 @@ async function main() {
// Set up redirect listener BEFORE navigation
await setupRedirectListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'responses';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Resource types to capture (by default, capture everything)
@@ -323,8 +323,8 @@ async function main() {
// Set up listener BEFORE navigation
await setupListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -1,268 +0,0 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
// Output location, relative to the current working directory.
const OUTPUT_FILE = 'singlefile.html';
const OUTPUT_DIR = '.';

// Chrome Web Store id and short name of the SingleFile extension.
const EXTENSION = {
    webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
    name: 'singlefile',
};

// Persona-scoped Chrome directories. An explicit CHROME_*_DIR env var wins;
// otherwise fall back to <DATA_DIR>/personas/<ACTIVE_PERSONA>/<subdir>, where
// DATA_DIR defaults to './data' and ACTIVE_PERSONA defaults to 'Default'.
const CHROME_DOWNLOADS_DIR =
    process.env.CHROME_DOWNLOADS_DIR ||
    path.join(
        process.env.DATA_DIR || './data',
        'personas',
        process.env.ACTIVE_PERSONA || 'Default',
        'chrome_downloads'
    );
const EXTENSIONS_DIR =
    process.env.CHROME_EXTENSIONS_DIR ||
    path.join(
        process.env.DATA_DIR || './data',
        'personas',
        process.env.ACTIVE_PERSONA || 'Default',
        'chrome_extensions'
    );
/**
 * Load the SingleFile extension from EXTENSIONS_DIR, installing it first if
 * needed (delegated to extensionUtils.loadOrInstallExtension).
 *
 * @returns {Promise<Object|null>} The value returned by
 *     loadOrInstallExtension on success, or null when it returned a falsy value.
 */
async function installSinglefileExtension() {
    console.log('[*] Installing SingleFile extension...');

    const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);

    if (installed) {
        console.log('[+] SingleFile extension installed');
        console.log('[+] Web pages will be saved as single HTML files');
        return installed;
    }

    // Report the failure and signal it to the caller with null.
    console.error('[❌] Failed to install SingleFile extension');
    return null;
}
/**
 * Sleep helper: returns a promise that settles once the timer fires.
 *
 * @param {number} ms - Delay passed to setTimeout, in milliseconds
 * @returns {Promise<void>} Resolves (with no value) after the delay
 */
function wait(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
/**
 * Save a page using the SingleFile extension.
 *
 * Triggers the extension's toolbar action on the given page, then polls
 * CHROME_DOWNLOADS_DIR for a newly created .html file whose SingleFile header
 * names the page's URL, and moves that file to OUTPUT_DIR/OUTPUT_FILE.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options (currently unused)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 * @throws {Error} If `extension` is missing or has no `version`
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
    if (!extension || !extension.version) {
        throw new Error('SingleFile extension not found or not loaded');
    }

    const url = await page.url();

    // Check for unsupported URL schemes
    const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
    const scheme = url.split(':')[0];
    if (URL_SCHEMES_IGNORED.includes(scheme)) {
        console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
        return null;
    }

    // True when the header contains "url: <url>" followed by whitespace or end of text.
    // A plain substring test would also accept a download of a *longer* URL that
    // merely starts with ours (e.g. https://site/ vs https://site/page).
    const url_marker = `url: ${url}`;
    const headerNamesUrl = (header) => {
        let idx = header.indexOf(url_marker);
        while (idx !== -1) {
            const next_char = header[idx + url_marker.length];
            if (next_char === undefined || /\s/.test(next_char)) {
                return true;
            }
            idx = header.indexOf(url_marker, idx + 1);
        }
        return false;
    };

    // rename() cannot cross filesystem boundaries (EXDEV); fall back to copy + delete.
    const moveFile = async (src, dest) => {
        try {
            await fs.promises.rename(src, dest);
        } catch (err) {
            if (!err || err.code !== 'EXDEV') {
                throw err;
            }
            await fs.promises.copyFile(src, dest);
            await fs.promises.unlink(src);
        }
    };

    // Ensure downloads directory exists
    await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });

    // Get list of existing files to ignore
    const files_before = new Set(
        (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'))
    );

    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);

    // Bring page to front (extension action button acts on foreground tab)
    await page.bringToFront();

    // Trigger the extension's action (toolbar button click)
    await extension.dispatchAction();

    // Wait for file to appear in downloads directory
    const check_delay = 3000; // 3 seconds
    const max_tries = 10;
    let files_new = [];

    for (let attempt = 0; attempt < max_tries; attempt++) {
        await wait(check_delay);

        const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'));
        files_new = files_after.filter(file => !files_before.has(file));
        if (files_new.length === 0) {
            continue;
        }

        // Find the matching file by checking if it contains the URL in the HTML header
        for (const file of files_new) {
            const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);

            let dl_text;
            try {
                dl_text = await fs.promises.readFile(dl_path, 'utf-8');
            } catch (err) {
                // The file can disappear between readdir() and readFile() (for example
                // if something else moves it out of the downloads directory).
                // Skip this candidate instead of failing the whole save.
                if (!err || err.code !== 'ENOENT') {
                    console.warn(`[⚠️] Could not read ${dl_path}: ${err && err.message}`);
                }
                continue;
            }

            const dl_header = dl_text.split('meta charset')[0];
            if (headerNamesUrl(dl_header)) {
                console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
                await moveFile(dl_path, out_path);
                return out_path;
            }
        }
    }

    console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
    console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
    return null;
}
/**
 * Save a page using single-file-cli (fallback method)
 *
 * Locates the `single-file` binary with `which`, runs it headlessly against
 * `url`, and writes the result to OUTPUT_DIR/OUTPUT_FILE.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Passed as --browser-user-agent
 * @param {string} [options.cookiesFile] - Passed as --browser-cookies-file (only if the file exists)
 * @param {boolean} [options.ignoreSSL] - Adds --browser-ignore-insecure-certs
 * @param {number} [options.timeout] - Timeout in milliseconds (default 120000)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
    console.log('[*] Falling back to single-file-cli...');

    // The command is executed through a shell, so every argument must be quoted:
    // URLs routinely contain & ; ? $ and user agents contain spaces and parentheses.
    // POSIX single-quote escaping (this function already depends on `which`).
    const shellQuote = (arg) => "'" + String(arg).replace(/'/g, "'\\''") + "'";

    // Find single-file binary
    let binary = null;
    try {
        const { stdout } = await execAsync('which single-file');
        binary = stdout.trim();
    } catch (err) {
        binary = null;
    }
    if (!binary) {
        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
        return null;
    }

    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build argument list: flags first, then the positional <url> <output>
    const args = ['--browser-headless'];
    if (options.ignoreSSL) {
        args.push('--browser-ignore-insecure-certs');
    }
    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
        args.push('--browser-cookies-file', options.cookiesFile);
    }
    if (options.userAgent) {
        args.push('--browser-user-agent', options.userAgent);
    }
    args.push(url, out_path);

    const command = [binary, ...args].map(shellQuote).join(' ');

    // Execute
    try {
        const timeout = options.timeout || 120000;
        await execAsync(command, { timeout });

        // A zero exit code is not enough: require a non-empty output file
        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
            return out_path;
        }

        console.error('[❌] SingleFile CLI completed but no output file found');
        return null;
    } catch (err) {
        console.error(`[❌] SingleFile CLI error: ${err.message}`);
        return null;
    }
}
/**
 * Main entry point - make sure the SingleFile extension is installed.
 *
 * Reuses the metadata in EXTENSIONS_DIR/singlefile.extension.json when that file
 * parses and the unpacked extension's manifest.json still exists; otherwise
 * installs the extension and (on success) writes fresh metadata to that file.
 *
 * @returns {Promise<Object|null>} Cached or freshly installed extension metadata,
 *   or whatever falsy value installSinglefileExtension() returned on failure.
 */
async function main() {
    const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');

    // Fast path: a readable cache file that points at an existing unpacked extension
    if (fs.existsSync(cacheFile)) {
        try {
            const cachedMeta = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            const hasManifest = fs.existsSync(path.join(cachedMeta.unpacked_path, 'manifest.json'));
            if (hasManifest) {
                console.log('[*] SingleFile extension already installed (using cache)');
                return cachedMeta;
            }
        } catch (e) {
            // Cache file corrupted, re-install
            console.warn('[⚠️] Extension cache corrupted, re-installing...');
        }
    }

    // Slow path: install, then persist the metadata for the chrome plugin to read
    const installed = await installSinglefileExtension();
    if (!installed) {
        return installed;
    }

    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
        cacheFile,
        JSON.stringify(installed, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);

    return installed;
}
// Public API for other plugins that require() this file:
// the extension metadata, the installer, and both save strategies
// (extension-driven and single-file-cli). main() is intentionally not exported;
// it only runs when this file is executed directly.
module.exports = {
EXTENSION,
installSinglefileExtension,
saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// When invoked as a script (not require()'d), run main() and translate its
// outcome into the process exit code: 0 on success, 1 on any rejection.
if (require.main === module) {
    const reportSuccess = () => {
        console.log('[✓] SingleFile extension setup complete');
        process.exit(0);
    };
    const reportFailure = (err) => {
        console.error('[❌] SingleFile extension setup failed:', err);
        process.exit(1);
    };

    main().then(reportSuccess).catch(reportFailure);
}

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* DISABLED: Extension functionality commented out - using single-file-cli only
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// DISABLED: Extension functionality - using single-file-cli only
// // Import extension utilities
// const extensionUtils = require('../chrome/chrome_utils.js');
// // Extension metadata
// const EXTENSION = {
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
// name: 'singlefile',
// };
// // Get extensions directory from environment or use default
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
// Output location for the archived page, relative to the current working directory
// (saveSinglefileWithCLI notes that the hook already runs in its output dir).
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Install the SingleFile extension
// */
// async function installSinglefileExtension() {
// console.log('[*] Installing SingleFile extension...');
// // Install the extension
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
// if (!extension) {
// console.error('[❌] Failed to install SingleFile extension');
// return null;
// }
// console.log('[+] SingleFile extension installed');
// console.log('[+] Web pages will be saved as single HTML files');
// return extension;
// }
// /**
// * Wait for a specified amount of time
// */
// function wait(ms) {
// return new Promise(resolve => setTimeout(resolve, ms));
// }
// /**
// * Save a page using the SingleFile extension
// *
// * @param {Object} page - Puppeteer page object
// * @param {Object} extension - Extension metadata with dispatchAction method
// * @param {Object} options - Additional options
// * @returns {Promise<string|null>} - Path to saved file or null on failure
// */
// async function saveSinglefileWithExtension(page, extension, options = {}) {
// if (!extension || !extension.version) {
// throw new Error('SingleFile extension not found or not loaded');
// }
// const url = await page.url();
// // Check for unsupported URL schemes
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
// const scheme = url.split(':')[0];
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
// return null;
// }
// // Ensure downloads directory exists
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
// // Get list of existing files to ignore
// const files_before = new Set(
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'))
// );
// // Output directory is current directory (hook already runs in output dir)
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
// // Bring page to front (extension action button acts on foreground tab)
// await page.bringToFront();
// // Trigger the extension's action (toolbar button click)
// await extension.dispatchAction();
// // Wait for file to appear in downloads directory
// const check_delay = 3000; // 3 seconds
// const max_tries = 10;
// let files_new = [];
// for (let attempt = 0; attempt < max_tries; attempt++) {
// await wait(check_delay);
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'));
// files_new = files_after.filter(file => !files_before.has(file));
// if (files_new.length === 0) {
// continue;
// }
// // Find the matching file by checking if it contains the URL in the HTML header
// for (const file of files_new) {
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
// const dl_header = dl_text.split('meta charset')[0];
// if (dl_header.includes(`url: ${url}`)) {
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
// await fs.promises.rename(dl_path, out_path);
// return out_path;
// }
// }
// }
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
// return null;
// }
/**
 * Save a page using single-file-cli (fallback method)
 *
 * Locates the `single-file` binary with `which`, runs it headlessly against
 * `url`, and writes the result to OUTPUT_DIR/OUTPUT_FILE.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Passed as --browser-user-agent
 * @param {string} [options.cookiesFile] - Passed as --browser-cookies-file (only if the file exists)
 * @param {boolean} [options.ignoreSSL] - Adds --browser-ignore-insecure-certs
 * @param {number} [options.timeout] - Timeout in milliseconds (default 120000)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
    console.log('[*] Falling back to single-file-cli...');

    // The command is executed through a shell, so every argument must be quoted:
    // URLs routinely contain & ; ? $ and user agents contain spaces and parentheses.
    // POSIX single-quote escaping (this function already depends on `which`).
    const shellQuote = (arg) => "'" + String(arg).replace(/'/g, "'\\''") + "'";

    // Find single-file binary
    let binary = null;
    try {
        const { stdout } = await execAsync('which single-file');
        binary = stdout.trim();
    } catch (err) {
        binary = null;
    }
    if (!binary) {
        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
        return null;
    }

    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build argument list: flags first, then the positional <url> <output>
    const args = ['--browser-headless'];
    if (options.ignoreSSL) {
        args.push('--browser-ignore-insecure-certs');
    }
    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
        args.push('--browser-cookies-file', options.cookiesFile);
    }
    if (options.userAgent) {
        args.push('--browser-user-agent', options.userAgent);
    }
    args.push(url, out_path);

    const command = [binary, ...args].map(shellQuote).join(' ');

    // Execute
    try {
        const timeout = options.timeout || 120000;
        await execAsync(command, { timeout });

        // A zero exit code is not enough: require a non-empty output file
        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
            return out_path;
        }

        console.error('[❌] SingleFile CLI completed but no output file found');
        return null;
    } catch (err) {
        console.error(`[❌] SingleFile CLI error: ${err.message}`);
        return null;
    }
}
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Main entry point - install extension before archiving
// */
// async function main() {
// // Check if extension is already cached
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
// if (fs.existsSync(cacheFile)) {
// try {
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
// if (fs.existsSync(manifestPath)) {
// console.log('[*] SingleFile extension already installed (using cache)');
// return cached;
// }
// } catch (e) {
// // Cache file corrupted, re-install
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
// }
// }
// // Install extension
// const extension = await installSinglefileExtension();
// // Export extension metadata for chrome plugin to load
// if (extension) {
// // Write extension info to a cache file that chrome plugin can read
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
// await fs.promises.writeFile(
// cacheFile,
// JSON.stringify(extension, null, 2)
// );
// console.log(`[+] Extension metadata written to ${cacheFile}`);
// }
// return extension;
// }
// Public API for other plugins that require() this file.
// Only the single-file-cli save function is exported.
module.exports = {
// DISABLED: Extension functionality - using single-file-cli only
// (these names are not defined while the extension code above stays commented out)
// EXTENSION,
// installSinglefileExtension,
// saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// DISABLED: Extension functionality - using single-file-cli only
// // Run if executed directly
// if (require.main === module) {
// main().then(() => {
// console.log('[✓] SingleFile extension setup complete');
// process.exit(0);
// }).catch(err => {
// console.error('[❌] SingleFile extension setup failed:', err);
// process.exit(1);
// });
// }
// Executed directly (not require()'d): there is nothing to install while the
// extension code is disabled, so just log that and exit successfully.
if (module === require.main) {
    console.log('[*] SingleFile extension install disabled - using single-file-cli only');
    process.exit(0);
}

View File

@@ -2,16 +2,15 @@
Integration tests for singlefile plugin
Tests verify:
1. Hook script exists and has correct metadata
2. Extension installation and caching works
3. Chrome/node dependencies available
4. Hook can be executed successfully
1. Hook scripts exist with correct naming
2. CLI-based singlefile extraction works
3. Dependencies available via abx-pkg
4. Output contains valid HTML
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
@@ -20,177 +19,63 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
TEST_URL = "https://example.com"
def test_install_script_exists():
"""Verify install script exists"""
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_snapshot_hook_exists():
"""Verify snapshot extraction hook exists"""
assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}"
def test_extension_metadata():
"""Test that SingleFile extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
result = subprocess.run(
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
capture_output=True,
text=True,
env=env
)
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
assert metadata["name"] == "singlefile"
def test_install_creates_cache():
"""Test that install creates extension cache"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Check output mentions installation
assert "SingleFile" in result.stdout or "singlefile" in result.stdout
# Check cache file was created
cache_file = ext_dir / "singlefile.extension.json"
assert cache_file.exists(), "Cache file should be created"
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
assert cache_data["name"] == "singlefile"
def test_install_twice_uses_cache():
"""Test that running install twice uses existing cache on second run"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
# First install - downloads the extension
result1 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
# Verify cache was created
cache_file = ext_dir / "singlefile.extension.json"
assert cache_file.exists(), "Cache file should exist after first install"
# Second install - should use cache
result2 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=30
)
assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
# Second run should be faster (uses cache) and mention cache
assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
def test_no_configuration_required():
"""Test that SingleFile works without configuration"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
# No API keys needed
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should work without API keys
assert result.returncode == 0
def test_priority_order():
"""Test that singlefile has correct priority (04)"""
# Extract priority from filename
filename = INSTALL_SCRIPT.name
assert "04" in filename, "SingleFile should have priority 04"
assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"
def test_output_directory_structure():
"""Test that plugin defines correct output structure"""
# Verify the script mentions singlefile output directory
script_content = INSTALL_SCRIPT.read_text()
# Should mention singlefile output directory
assert "singlefile" in script_content.lower()
# Should mention HTML output
assert ".html" in script_content or "html" in script_content.lower()
def test_snapshot_hook_priority():
"""Test that snapshot hook has correct priority (50)"""
filename = SNAPSHOT_HOOK.name
assert "50" in filename, "SingleFile snapshot hook should have priority 50"
assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
"""Verify dependencies are available via abx-pkg."""
from abx_pkg import Binary, EnvProvider
EnvProvider.model_rebuild()
# Verify node is available (singlefile uses Chrome extension, needs Node)
# Verify node is available
node_binary = Binary(name='node', binproviders=[EnvProvider()])
node_loaded = node_binary.load()
assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
def test_singlefile_hook_runs():
"""Verify singlefile hook can be executed and completes."""
# Prerequisites checked by earlier test
def test_singlefile_cli_archives_example_com():
"""Test that singlefile CLI archives example.com and produces valid HTML."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run singlefile extraction hook
env = os.environ.copy()
env['SINGLEFILE_ENABLED'] = 'true'
# Run singlefile snapshot hook
result = subprocess.run(
['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=120
)
# Hook should complete successfully (even if it just installs extension)
assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
# Verify extension installation happens
assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
# Verify output file exists
output_file = tmpdir / 'singlefile.html'
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
# Verify it contains real HTML
html_content = output_file.read_text()
assert len(html_content) > 500, "Output file too small to be valid HTML"
assert '<!DOCTYPE html>' in html_content or '<html' in html_content, "Output should contain HTML doctype or html tag"
assert 'Example Domain' in html_content, "Output should contain example.com content"
if __name__ == '__main__':

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
function parseArgs() {
@@ -211,8 +211,8 @@ async function main() {
// Set up listener BEFORE navigation
await setupListener(url);
// Write PID file so chrome_cleanup can kill any remaining processes
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'staticfile';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Content-Types that indicate static files
@@ -398,8 +398,8 @@ async function main() {
// Set up static file listener BEFORE navigation
await setupStaticFileListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -0,0 +1,50 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"TWOCAPTCHA_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"],
"description": "Enable 2captcha browser extension for automatic CAPTCHA solving"
},
"TWOCAPTCHA_API_KEY": {
"type": "string",
"default": "",
"x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"],
"x-sensitive": true,
"description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)"
},
"TWOCAPTCHA_RETRY_COUNT": {
"type": "integer",
"default": 3,
"minimum": 0,
"maximum": 10,
"x-aliases": ["CAPTCHA2_RETRY_COUNT"],
"description": "Number of times to retry CAPTCHA solving on error"
},
"TWOCAPTCHA_RETRY_DELAY": {
"type": "integer",
"default": 5,
"minimum": 0,
"maximum": 60,
"x-aliases": ["CAPTCHA2_RETRY_DELAY"],
"description": "Delay in seconds between CAPTCHA solving retries"
},
"TWOCAPTCHA_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["CAPTCHA2_TIMEOUT"],
"description": "Timeout for CAPTCHA solving in seconds"
},
"TWOCAPTCHA_AUTO_SUBMIT": {
"type": "boolean",
"default": false,
"description": "Automatically submit forms after CAPTCHA is solved"
}
}
}

View File

@@ -12,7 +12,7 @@
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - TWOCAPTCHA_API_KEY environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
@@ -25,7 +25,7 @@ const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
name: 'captcha2',
name: 'twocaptcha',
};
// Get extensions directory from environment or use default
@@ -47,10 +47,10 @@ async function installCaptchaExtension() {
}
// Check if API key is configured
const apiKey = process.env.API_KEY_2CAPTCHA;
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}
@@ -69,7 +69,7 @@ async function installCaptchaExtension() {
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
if (fs.existsSync(cacheFile)) {
try {

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
* Runs once per crawl to inject configuration into extension storage.
*
* Priority: 25 (after chrome_launch at 30, before snapshots start)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Config Options (from config.json / environment):
* - TWOCAPTCHA_API_KEY: API key for 2captcha service
* - TWOCAPTCHA_ENABLED: Enable/disable the extension
* - TWOCAPTCHA_RETRY_COUNT: Number of retries on error
* - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds)
* - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving
*
* Requirements:
* - TWOCAPTCHA_API_KEY environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
/**
 * Resolve the crawl-level chrome session directory.
 *
 * Reads the CRAWL_OUTPUT_DIR environment variable (set by hooks.py).
 *
 * @returns {string|null} `<CRAWL_OUTPUT_DIR>/chrome`, or null when the variable is unset or empty
 */
function getCrawlChromeSessionDir() {
    const crawlDir = process.env.CRAWL_OUTPUT_DIR || '';
    return crawlDir ? path.join(crawlDir, 'chrome') : null;
}
// Chrome session directory: the crawl-level one when CRAWL_OUTPUT_DIR is set,
// otherwise the sibling ../chrome directory relative to the working directory.
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
// Marker file inside the session directory; configure2Captcha() skips its work
// when this file already exists.
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured');
/**
 * Read an environment variable as a trimmed string.
 *
 * @param {string} name - Environment variable name
 * @param {string} [defaultValue=''] - Used when the variable is unset or empty
 * @returns {string} The variable's value (or the default), with surrounding whitespace removed
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const chosen = raw ? raw : defaultValue;
    return chosen.trim();
}
/**
 * Read an environment variable as a boolean.
 *
 * Recognizes (case-insensitively) true/1/yes/on and false/0/no/off;
 * anything else, including an unset variable, yields `defaultValue`.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned for unset or unrecognized values
 * @returns {boolean}
 */
function getEnvBool(name, defaultValue = false) {
    switch (getEnv(name, '').toLowerCase()) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
/**
 * Read an environment variable as a base-10 integer.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Returned when the variable is unset or does not parse as an integer
 * @returns {number}
 */
function getEnvInt(name, defaultValue = 0) {
    const text = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(text, 10);
    return Number.isNaN(parsed) ? defaultValue : parsed;
}
/**
 * Parse `--key=value` style command line arguments from process.argv.
 *
 * Only arguments starting with `--` are considered. Dashes in the key become
 * underscores, the value is everything after the first `=`, and a missing or
 * empty value is stored as `true`.
 *
 * @returns {Object} Map of option name to string value (or true for bare flags)
 */
function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) {
            continue;
        }
        const [rawKey, ...rest] = token.slice(2).split('=');
        const optionName = rawKey.replace(/-/g, '_');
        const optionValue = rest.join('=');
        parsed[optionName] = optionValue || true;
    }
    return parsed;
}
/**
 * Build the 2captcha extension configuration from environment variables.
 *
 * The API key is taken from the first non-empty of TWOCAPTCHA_API_KEY,
 * API_KEY_2CAPTCHA (legacy) and CAPTCHA2_API_KEY. The remaining tunables come
 * from TWOCAPTCHA_ENABLED, TWOCAPTCHA_RETRY_COUNT, TWOCAPTCHA_RETRY_DELAY and
 * TWOCAPTCHA_AUTO_SUBMIT.
 *
 * @returns {Object} Config object shaped for the extension's storage
 *   (chrome.storage.local.set({config: {...}}))
 */
function getTwoCaptchaConfig() {
    const key = getEnv('TWOCAPTCHA_API_KEY') || getEnv('API_KEY_2CAPTCHA') || getEnv('CAPTCHA2_API_KEY');

    const settings = {
        // API key - both variants for compatibility
        apiKey: key,
        api_key: key,
        // Plugin enabled state
        isPluginEnabled: getEnvBool('TWOCAPTCHA_ENABLED', true),
        // Retry settings
        repeatOnErrorTimes: getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3),
        repeatOnErrorDelay: getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5),
        // Auto-submit setting
        autoSubmitForms: getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false),
        submitFormsDelay: 0,
    };

    // Every CAPTCHA type gets an enabledFor<Type> and an autoSolve<Type> flag.
    // All are switched on except RecaptchaAudio, which stays off for both.
    const captchaTypes = [
        'Normal',
        'RecaptchaV2',
        'InvisibleRecaptchaV2',
        'RecaptchaV3',
        'RecaptchaAudio',
        'Geetest',
        'Geetest_v4',
        'Keycaptcha',
        'Arkoselabs',
        'Lemin',
        'Yandex',
        'CapyPuzzle',
        'Turnstile',
        'AmazonWaf',
        'MTCaptcha',
    ];
    for (const prefix of ['enabledFor', 'autoSolve']) {
        for (const captchaType of captchaTypes) {
            settings[prefix + captchaType] = captchaType !== 'RecaptchaAudio';
        }
    }

    // Other settings with sensible defaults
    Object.assign(settings, {
        recaptchaV2Type: 'token',
        recaptchaV3MinScore: 0.3,
        buttonPosition: 'inner',
        useProxy: false,
        proxy: '',
        proxytype: 'HTTP',
        blackListDomain: '',
        autoSubmitRules: [],
        normalSources: [],
    });

    return settings;
}
/**
 * Push the 2captcha settings into the extension running in the current Chrome session.
 *
 * Steps:
 *   1. Return early if the CONFIG_MARKER file already exists (configured earlier in this session).
 *   2. Build the config via getTwoCaptchaConfig() and refuse to continue without an API key.
 *   3. Connect to Chrome over CDP using the URL stored in <CHROME_SESSION_DIR>/cdp_url.txt.
 *   4. Look up the extension ID in <CHROME_SESSION_DIR>/extensions.json, open the extension's
 *      options page and store the config under the `config` key of chrome.storage.local.
 *   5. On success, write CONFIG_MARKER with a summary of what was applied (never the API key itself).
 *
 * Errors raised while talking to the browser are caught and reported in the return value.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
    // Check if already configured in this session
    if (fs.existsSync(CONFIG_MARKER)) {
        console.error('[*] 2captcha already configured in this browser session');
        return { success: true, skipped: true };
    }

    // Get configuration
    const config = getTwoCaptchaConfig();

    // Check if API key is set
    if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') {
        console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured');
        console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
        return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' };
    }

    console.error('[*] Configuring 2captcha extension...');
    console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
    console.error(`[*] Enabled: ${config.isPluginEnabled}`);
    console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`);
    console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`);
    console.error(`[*] Auto Submit: ${config.autoSubmitForms}`);
    console.error(`[*] Auto Solve: all CAPTCHA types enabled`);

    try {
        // Connect to the existing Chrome session via CDP
        const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
        if (!fs.existsSync(cdpFile)) {
            return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
        }
        const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
        const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

        try {
            // First, navigate to a page to trigger extension content scripts and wake up service worker
            console.error('[*] Waking up extension by visiting a page...');
            const triggerPage = await browser.newPage();
            try {
                await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
                await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize
            } catch (e) {
                // Best effort only: a failed wake-up visit must not abort configuration
                console.warn(`[!] Trigger page failed: ${e.message}`);
            }
            try { await triggerPage.close(); } catch (e) {}

            // Get 2captcha extension info from extensions.json
            const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
            if (!fs.existsSync(extensionsFile)) {
                return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
            }
            const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
            const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
            if (!captchaExt) {
                console.error('[*] 2captcha extension not installed, skipping configuration');
                return { success: true, skipped: true };
            }
            if (!captchaExt.id) {
                return { success: false, error: '2captcha extension ID not found in extensions.json' };
            }
            const extensionId = captchaExt.id;
            const extensionOrigin = `chrome-extension://${extensionId}`;
            console.error(`[*] 2captcha Extension ID: ${extensionId}`);

            // Configure via options page
            console.error('[*] Configuring via options page...');
            const optionsUrl = `${extensionOrigin}/options/options.html`;

            // Keep a handle on the tab we open ourselves: configPage may later be pointed at a
            // different tab, and the one opened here must still be closed afterwards.
            const openedPage = await browser.newPage();
            let configPage = openedPage;
            try {
                // Navigate to options page - catch error but continue since page may still load
                try {
                    await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
                } catch (navError) {
                    // Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads
                    console.error(`[*] Navigation threw error (may still work): ${navError.message}`);
                }

                // Wait a moment for page to settle
                await new Promise(r => setTimeout(r, 3000));

                // Check all pages for the extension page (Chrome may open it in a different tab)
                const pages = await browser.pages();
                for (const page of pages) {
                    if (page.url().startsWith(extensionOrigin)) {
                        configPage = page;
                        break;
                    }
                }

                const currentUrl = configPage.url();
                console.error(`[*] Current URL: ${currentUrl}`);
                if (!currentUrl.startsWith(extensionOrigin)) {
                    return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` };
                }

                // Wait for Config object to be available
                console.error('[*] Waiting for Config object...');
                await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });

                // Use chrome.storage.local.set with the config wrapper
                // (the callback below runs inside the extension page, not in Node)
                const result = await configPage.evaluate((cfg) => {
                    return new Promise((resolve) => {
                        if (typeof chrome !== 'undefined' && chrome.storage) {
                            chrome.storage.local.set({ config: cfg }, () => {
                                if (chrome.runtime.lastError) {
                                    resolve({ success: false, error: chrome.runtime.lastError.message });
                                } else {
                                    resolve({ success: true, method: 'options_page' });
                                }
                            });
                        } else {
                            resolve({ success: false, error: 'chrome.storage not available' });
                        }
                    });
                }, config);

                if (result.success) {
                    console.error(`[+] 2captcha configured via ${result.method}`);
                    // Marker records what was applied; only a boolean is stored for the API key
                    fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
                        timestamp: new Date().toISOString(),
                        method: result.method,
                        extensionId: extensionId,
                        config: {
                            apiKeySet: !!config.apiKey,
                            isPluginEnabled: config.isPluginEnabled,
                            repeatOnErrorTimes: config.repeatOnErrorTimes,
                            repeatOnErrorDelay: config.repeatOnErrorDelay,
                            autoSubmitForms: config.autoSubmitForms,
                            autoSolveEnabled: true,
                        }
                    }, null, 2));
                    return { success: true, method: result.method };
                }
                return { success: false, error: result.error || 'Config failed' };
            } finally {
                // Close the tab we opened and, when the options page was found in another tab,
                // that tab as well (the Set avoids closing the same page twice).
                for (const page of new Set([openedPage, configPage])) {
                    try { await page.close(); } catch (e) {}
                }
            }
        } finally {
            // Only detach from the shared browser; it must keep running for later hooks
            browser.disconnect();
        }
    } catch (e) {
        return { success: false, error: `${e.name}: ${e.message}` };
    }
}
/**
 * CLI entry point: validate arguments, run the 2captcha configuration, then exit.
 *
 * Exits with status 1 when --url or the snapshot id argument is missing, or when
 * configuration fails; exits with status 0 when configuration succeeded or was skipped.
 * Any failure reason is printed to stderr as `ERROR: <message>`.
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    // Assume failure until configure2Captcha() reports otherwise
    let status = 'failed';
    let error = '';
    try {
        const result = await configure2Captcha();
        if (result.skipped) {
            status = 'skipped';
        } else if (result.success) {
            status = 'succeeded';
        } else {
            error = result.error || 'Configuration failed';
        }
    } catch (e) {
        error = `${e.name}: ${e.message}`;
    }

    if (error) {
        console.error(`ERROR: ${error}`);
    }

    // Config hooks don't emit JSONL - they're utility hooks for setup
    // Exit code indicates success/failure
    process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
// Script entry: run main() and turn any rejection that escapes it into a non-zero exit.
main().catch((fatal) => {
    console.error(`Fatal error: ${fatal.message}`);
    process.exit(1);
});

View File

@@ -0,0 +1,396 @@
"""
Integration tests for twocaptcha plugin
Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
"""
import json
import os
import signal
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
# Directory two levels above this file (the plugin directory that contains this tests folder).
PLUGIN_DIR = Path(__file__).parent.parent
# Parent of the plugin directory; used below to locate the sibling 'chrome' plugin.
PLUGINS_ROOT = PLUGIN_DIR.parent
# Hook scripts of this plugin that the tests run with node.
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
# Hooks of the chrome plugin: one is run with python to install the browser, one with node to launch it.
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
# Page loaded by test_solves_recaptcha to check that a reCAPTCHA response gets filled in.
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/<YYYYMMDD>/
                    snapshots/<YYYYMMDD>/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    The calling test is skipped (pytest.skip) when the hook exits non-zero or when
    no existing Chromium binary path can be read from its JSONL output.

    Args:
        tmpdir: Temporary directory under which the 'data' tree is created.

    Returns:
        A copy of os.environ extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR, CRAWLS_DIR,
        SNAPSHOTS_DIR, CHROME_HEADLESS and CHROME_BINARY.
    """
    import platform
    import sys
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    for directory in (node_modules_dir, npm_bin_dir, chrome_extensions_dir, crawls_dir, snapshots_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Run it with the interpreter executing the tests: a bare 'python' may be absent
    # from PATH or resolve to a different environment.
    result = subprocess.run(
        [sys.executable or 'python', str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Non-JSON log lines may be mixed into stdout; ignore them
            continue
        # Valid JSON that is not an object (e.g. a bare number) carries no record
        if not isinstance(data, dict):
            continue
        if data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url).

    Starts the chrome launch hook with node (cwd=chrome_dir) and polls once per
    second, for up to 30 iterations, until cdp_url.txt holds a URL and
    extensions.json exists in chrome_dir. If only the CDP URL appears in that
    time the function still returns; extensions.json may then be missing.

    Args:
        env: Environment passed to the hook process.
        chrome_dir: Directory the hook runs in; created if missing.
        crawl_id: Value passed to the hook as --crawl-id.

    Returns:
        Tuple of (subprocess.Popen for the hook, CDP URL string).

    Raises:
        RuntimeError: If the hook exits while being polled, or no CDP URL was
            read after the polling loop.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)
    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    cdp_file = chrome_dir / 'cdp_url.txt'
    ext_file = chrome_dir / 'extensions.json'
    cdp_url = None
    extensions_ready = False
    for _ in range(30):
        if process.poll() is not None:
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        # Re-read while the value is still empty, in case the file exists but is not written yet
        if cdp_file.exists() and not cdp_url:
            cdp_url = cdp_file.read_text().strip()
        if ext_file.exists():
            extensions_ready = True
        if cdp_url and extensions_ready:
            break
        time.sleep(1)

    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")

    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any available stderr without blocking
        import fcntl
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except Exception:
            # Debug output is best-effort; a failed non-blocking read must not fail the launch
            pass
        finally:
            # Put the pipe back into its original mode
            fcntl.fcntl(fd, fcntl.F_SETFL, fl)

    return process, cdp_url
def kill_chrome(process, chrome_dir: Path):
    """Kill Chromium process.

    Sends SIGTERM to the launcher process and waits up to 5 seconds; if it has
    not exited by then it is killed. Afterwards, if chrome_dir contains a
    chrome.pid file, the PID stored in it is sent SIGKILL.

    Cleanup is best-effort: OS errors (e.g. process already gone) and an
    unparsable PID file are ignored, so this is safe to call from `finally`.

    Args:
        process: Popen-like object returned by launch_chrome.
        chrome_dir: Directory that may contain chrome.pid.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Launcher ignored SIGTERM: force-kill it so it is not left running
        try:
            process.kill()
            process.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            pass
    except OSError:
        pass

    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except (OSError, ValueError):
            # Already dead, not permitted, or the file does not hold a valid PID
            pass
class TestTwoCaptcha:
    """Integration tests requiring TWOCAPTCHA_API_KEY.

    Each test builds an isolated data directory with setup_test_env(), runs the
    extension install hook, launches Chromium through the chrome launch hook and
    always tears the browser down again with kill_chrome().
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Read the API key from the environment; skip the test when none is set."""
        self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
        if not self.api_key:
            pytest.skip("TWOCAPTCHA_API_KEY required")

    def test_install_and_load(self):
        """Extension installs and loads in Chromium."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key

            # Install
            result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
            assert result.returncode == 0, f"Install failed: {result.stderr}"

            cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
            assert cache.exists()
            data = json.loads(cache.read_text())
            assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'

            # Launch Chromium in crawls directory
            crawl_id = 'test'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                exts = json.loads((chrome_dir / 'extensions.json').read_text())
                assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
                print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
            finally:
                kill_chrome(process, chrome_dir)

    def test_config_applied(self):
        """Configuration is applied to extension and verified via Config.getAll()."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key
            env['TWOCAPTCHA_RETRY_COUNT'] = '5'
            env['TWOCAPTCHA_RETRY_DELAY'] = '10'

            # Fail here, with the installer's own output, rather than later with a confusing error
            install = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
            assert install.returncode == 0, f"Install failed: {install.stderr}"

            # Launch Chromium in crawls directory
            crawl_id = 'cfg'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                result = subprocess.run(
                    ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
                    env=env, timeout=30, capture_output=True, text=True
                )
                assert result.returncode == 0, f"Config failed: {result.stderr}"
                assert (chrome_dir / '.twocaptcha_configured').exists()

                # Verify config via options.html and Config.getAll()
                # Get the actual extension ID from the config marker (Chrome computes IDs differently)
                config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
                ext_id = config_marker['extensionId']

                script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Load options.html and use Config.getAll() to verify
const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
const page = await browser.newPage();
console.error('[*] Loading options page:', optionsUrl);
// Navigate - catch error but continue since page may still load
try {{
await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
}} catch (e) {{
console.error('[*] Navigation threw error (may still work):', e.message);
}}
// Wait for page to settle
await new Promise(r => setTimeout(r, 2000));
console.error('[*] Current URL:', page.url());
// Wait for Config object to be available
await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});
// Call Config.getAll() - the extension's own API (returns a Promise)
const cfg = await page.evaluate(async () => await Config.getAll());
console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));
await page.close();
browser.disconnect();
console.log(JSON.stringify(cfg));
}})();
'''
                (tmpdir / 'v.js').write_text(script)
                r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
                print(r.stderr)
                assert r.returncode == 0, f"Verify failed: {r.stderr}"

                # The script prints the config JSON as its last stdout line
                cfg = json.loads(r.stdout.strip().split('\n')[-1])
                print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")

                # Verify all the fields we care about
                assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
                assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
                assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
                assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
                assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
                assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
                assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
                assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"
                print(f"[+] Config verified via Config.getAll()!")
            finally:
                kill_chrome(process, chrome_dir)

    def test_solves_recaptcha(self):
        """Extension solves reCAPTCHA on demo page."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key

            # Fail here, with the installer's own output, rather than later with a confusing error
            install = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
            assert install.returncode == 0, f"Install failed: {install.stderr}"

            # Launch Chromium in crawls directory
            crawl_id = 'solve'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                # If configuration failed, stop now instead of polling the demo page for 90s
                configured = subprocess.run(
                    ['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'],
                    env=env, timeout=30, capture_output=True, text=True
                )
                assert configured.returncode == 0, f"Config failed: {configured.stderr}"

                script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
const page = await browser.newPage();
await page.setViewport({{ width: 1440, height: 900 }});
console.error('[*] Loading {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
await new Promise(r => setTimeout(r, 3000));
const start = Date.now();
const maxWait = 90000;
while (Date.now() - start < maxWait) {{
const state = await page.evaluate(() => {{
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
const solver = document.querySelector('.captcha-solver');
return {{
solved: resp ? resp.value.length > 0 : false,
state: solver?.getAttribute('data-state'),
text: solver?.textContent?.trim() || ''
}};
}});
const sec = Math.round((Date.now() - start) / 1000);
console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
await new Promise(r => setTimeout(r, 2000));
}}
const final = await page.evaluate(() => {{
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
}});
browser.disconnect();
console.log(JSON.stringify(final));
}})();
'''
                (tmpdir / 's.js').write_text(script)
                print("\n[*] Solving CAPTCHA (10-60s)...")
                r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
                print(r.stderr)
                assert r.returncode == 0, f"Failed: {r.stderr}"

                # The result is the last stdout line that looks like a JSON object
                final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
                assert final.get('solved'), f"Not solved: {final}"
                print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
            finally:
                kill_chrome(process, chrome_dir)
# When executed directly as a script, run pytest on this file with the -x, -v and -s flags.
if __name__ == '__main__':
    pytest.main([__file__, '-xvs'])

View File

@@ -14,7 +14,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
def test_install_script_exists():
@@ -158,26 +158,221 @@ def test_large_extension_size():
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url) or raise on failure.

    Starts the chrome launch hook with node (cwd=chrome_dir) and polls once per
    second, for up to 20 iterations, until cdp_url.txt in chrome_dir holds a
    non-empty URL.

    Args:
        env: Environment passed to the hook process.
        chrome_dir: Directory the hook runs in; created if missing.
        crawl_id: Value passed to the hook as --crawl-id.

    Returns:
        Tuple of (subprocess.Popen for the hook, CDP URL string).

    Raises:
        RuntimeError: If the hook exits while being polled, or no CDP URL was
            read after the polling loop (the hook process is killed first).
    """
    import time

    chrome_dir.mkdir(parents=True, exist_ok=True)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_file = chrome_dir / 'cdp_url.txt'
    cdp_url = None
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            # The file can exist before its content is written; keep polling
            # on an empty read instead of giving up immediately.
            cdp_url = cdp_file.read_text().strip() or None
            if cdp_url:
                break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        try:
            # Reap the killed launcher so it does not linger as a zombie
            chrome_launch_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            pass
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium process.

    Sends SIGTERM to the launcher process and waits up to 5 seconds; if it has
    not exited by then it is killed. Afterwards, if chrome_dir contains a
    chrome.pid file, the PID stored in it is sent SIGKILL.

    Cleanup is best-effort: OS errors (e.g. process already gone) and an
    unparsable PID file are ignored.

    Args:
        chrome_launch_process: Popen-like object returned by launch_chromium_session.
        chrome_dir: Directory that may contain chrome.pid.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Launcher ignored SIGTERM: force-kill it so it is not left running
        try:
            chrome_launch_process.kill()
            chrome_launch_process.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            pass
    except OSError:
        pass

    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            pass
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
    """Check ad blocking effectiveness by counting ad elements on page.

    Writes a puppeteer script to script_dir/check_ads.js, runs it with node
    against the browser at cdp_url, and parses the last stdout line that starts
    with '{' as the JSON result.

    Args:
        cdp_url: Browser WebSocket endpoint the script connects to.
        test_url: Page to load and inspect.
        env: Environment for the node subprocess.
        script_dir: Directory where check_ads.js is written; also the subprocess cwd.

    Returns dict with:
        - adElementsFound: int - number of elements matching the ad selectors
          (an element matching several selectors is counted once per selector)
        - adElementsVisible: int - number of those matches that are visible
        - pageTitle: str - document.title of the loaded page
        - blockedRequests: int - number of failed requests whose URL contains
          one of the ad/tracker domain substrings
        - totalRequests: int - total network requests made
        - percentBlocked: int - percentage of ad elements hidden (0-100)

    Raises:
        RuntimeError: If the node script exits non-zero or prints no JSON line.
    """
    # NOTE: cdp_url and test_url are interpolated into the JS source unescaped.
    test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
// Track network requests
let blockedRequests = 0;
let totalRequests = 0;
const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];
page.on('request', request => {{
totalRequests++;
const url = request.url().toLowerCase();
if (adDomains.some(d => url.includes(d))) {{
// This is an ad request
}}
}});
page.on('requestfailed', request => {{
const url = request.url().toLowerCase();
if (adDomains.some(d => url.includes(d))) {{
blockedRequests++;
}}
}});
console.error('Navigating to {test_url}...');
await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});
// Wait for page to fully render and ads to load
await new Promise(r => setTimeout(r, 5000));
// Check for ad elements in the DOM
const result = await page.evaluate(() => {{
// Common ad-related selectors
const adSelectors = [
// Generic ad containers
'[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
'[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
'[class*="advertisement"]', '[id*="advertisement"]',
'[class*="sponsored"]', '[id*="sponsored"]',
// Google ads
'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
// Yahoo specific
'[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
'[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
// iframes (often ads)
'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
// Common ad sizes
'[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
'[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
];
let adElementsFound = 0;
let adElementsVisible = 0;
for (const selector of adSelectors) {{
try {{
const elements = document.querySelectorAll(selector);
for (const el of elements) {{
adElementsFound++;
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const isVisible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0' &&
rect.width > 0 && rect.height > 0;
if (isVisible) {{
adElementsVisible++;
}}
}}
}} catch (e) {{
// Invalid selector, skip
}}
}}
return {{
adElementsFound,
adElementsVisible,
pageTitle: document.title
}};
}});
result.blockedRequests = blockedRequests;
result.totalRequests = totalRequests;
// Calculate how many ad elements were hidden (found but not visible)
const hiddenAds = result.adElementsFound - result.adElementsVisible;
result.percentBlocked = result.adElementsFound > 0
? Math.round((hiddenAds / result.adElementsFound) * 100)
: 0;
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
    script_path = script_dir / 'check_ads.js'
    script_path.write_text(test_script)

    result = subprocess.run(
        ['node', str(script_path)],
        cwd=str(script_dir),
        capture_output=True,
        text=True,
        env=env,
        timeout=90
    )

    if result.returncode != 0:
        raise RuntimeError(f"Ad check script failed: {result.stderr}")

    # Diagnostics go to stderr; the result is the last stdout line starting with '{'
    output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
    if not output_lines:
        raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")

    return json.loads(output_lines[-1])
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure like:
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
bin/
.bin/
node_modules/
chrome_extensions/
personas/
default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict:
machine = 'x86_64'
machine_type = f"{machine}-{system}"
# Create proper directory structure
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / 'bin'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
chrome_extensions_dir = data_dir / 'chrome_extensions'
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Build complete env dict
env = os.environ.copy()
@@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict:
'NPM_BIN_DIR': str(npm_bin_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=10, env=env
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
@@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict:
return env
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
# Test URL: Yahoo has many ads that uBlock should block
TEST_URL = 'https://www.yahoo.com/'
@pytest.mark.timeout(15)
@@ -290,14 +497,18 @@ def test_extension_loads_in_chromium():
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
print("[test] Launching Chromium...", flush=True)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
# Launch Chromium in crawls directory
crawl_id = 'test-ublock'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core');
def test_blocks_ads_on_test_page():
"""Live test: verify uBlock Origin blocks ads on a test page.
Uses Chromium with extensions loaded automatically via chrome hook.
Tests against d3ward's ad blocker test page which checks ad domains.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
that a test page happens to show ads as blocked.
"""
import signal
import time
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env['CHROME_HEADLESS'] = 'true'
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_ad_blocking(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
f"(found {baseline_result['adElementsFound']} ad elements)")
finally:
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
pytest.skip(
f"Cannot test extension: no ad elements found on {TEST_URL}. "
f"The page may have changed or loaded differently."
)
if baseline_result['adElementsVisible'] == 0:
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
print("This suggests either:")
print(" - There's another ad blocker interfering")
print(" - Network-level ad blocking is in effect")
pytest.skip(
f"Cannot test extension: baseline shows no visible ads "
f"despite finding {baseline_result['adElementsFound']} ad elements."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
# ============================================================
# STEP 2: Install the uBlock extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=15
env=env_base,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'ublock.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# ============================================================
# STEP 3: Run WITH extension, verify ads ARE blocked
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chrome CDP URL not found after 20s"
print(f"Chrome launched with CDP URL: {cdp_url}")
# Check that extensions were loaded
extensions_file = chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
# Step 3: Connect to Chrome and test ad blocking
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
ext_process, ext_cdp_url = launch_chromium_session(
env_base, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 500));
# Wait for extension to initialize
time.sleep(3)
// Check extension loaded by looking at targets
const targets = browser.targets();
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
console.error('Extension targets found:', extTargets.length);
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
// Wait for the test page to run its checks
await new Promise(r => setTimeout(r, 5000));
// The d3ward test page shows blocked percentage
const result = await page.evaluate(() => {{
const scoreEl = document.querySelector('#score');
const score = scoreEl ? scoreEl.textContent : null;
const blockedItems = document.querySelectorAll('.blocked').length;
const totalItems = document.querySelectorAll('.testlist li').length;
return {{
score,
blockedItems,
totalItems,
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
}};
}});
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_ublock.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=10
ext_result = check_ad_blocking(
ext_cdp_url, TEST_URL, env_base, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
# uBlock should block most ad domains on the test page
assert test_result['percentBlocked'] >= 50, \
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
f"(found {ext_result['adElementsFound']} ad elements)")
finally:
# Clean up Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
# Calculate reduction in visible ads
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
# Extension should significantly reduce visible ads
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
f"uBlock should reduce visible ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
# Extension should block at least 30% of ads
assert reduction_percent >= 30, \
f"uBlock should block at least 30% of ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")

View File

@@ -133,7 +133,7 @@ This plugin provides shared Chrome infrastructure for other plugins. It manages
chrome/
├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings
├── on_Crawl__00_chrome_install.py # Install Chrome binary
├── on_Crawl__20_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg)
├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground)
├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks