diff --git a/ArchiveBox.conf b/ArchiveBox.conf
deleted file mode 100644
index a7eaff41..00000000
--- a/ArchiveBox.conf
+++ /dev/null
@@ -1,3 +0,0 @@
-[SERVER_CONFIG]
-SECRET_KEY = y6fw9wcaqls9sx_dze6ahky9ggpkpzoaw5g5v98_u3ro5j0_4f
-
diff --git a/archivebox/api/v1_workers.py b/archivebox/api/v1_workers.py
index 95678ef5..9e138e16 100644
--- a/archivebox/api/v1_workers.py
+++ b/archivebox/api/v1_workers.py
@@ -35,12 +35,8 @@ class WorkerSchema(Schema):
model: str
max_tick_time: int
max_concurrent_tasks: int
- poll_interval: float
- idle_timeout: int
running_count: int
running_workers: List[dict[str, Any]]
- queue_count: int
- queue: List[QueueItemSchema]
@staticmethod
def resolve_model(obj) -> str:
@@ -55,38 +51,21 @@ class WorkerSchema(Schema):
def resolve_max_concurrent_tasks(obj) -> int:
return obj.MAX_CONCURRENT_TASKS
- @staticmethod
- def resolve_poll_interval(obj) -> float:
- return obj.POLL_INTERVAL
-
- @staticmethod
- def resolve_idle_timeout(obj) -> int:
- return obj.IDLE_TIMEOUT
-
@staticmethod
def resolve_running_count(obj) -> int:
- return len(obj.get_running_workers())
+ return obj.get_worker_count()
@staticmethod
def resolve_running_workers(obj) -> List[dict[str, Any]]:
return obj.get_running_workers()
- @staticmethod
- def resolve_queue_count(obj) -> int:
- return obj.get_queue().count()
-
- @staticmethod
- def resolve_queue(obj) -> List[QueueItemSchema]:
- return list(obj.get_queue()[:50]) # Limit to 50 items
-
class OrchestratorSchema(Schema):
"""Schema for the Orchestrator."""
is_running: bool
poll_interval: float
idle_timeout: int
- max_workers_per_type: int
- max_total_workers: int
+ max_crawl_workers: int
total_worker_count: int
workers: List[WorkerSchema]
@@ -95,23 +74,20 @@ class OrchestratorSchema(Schema):
def get_orchestrator(request):
"""Get the orchestrator status and all worker queues."""
from archivebox.workers.orchestrator import Orchestrator
- from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
+ from archivebox.workers.worker import CrawlWorker
orchestrator = Orchestrator()
# Create temporary worker instances to query their queues
workers = [
CrawlWorker(worker_id=-1),
- SnapshotWorker(worker_id=-1),
- ArchiveResultWorker(worker_id=-1),
]
return {
'is_running': orchestrator.is_running(),
'poll_interval': orchestrator.POLL_INTERVAL,
'idle_timeout': orchestrator.IDLE_TIMEOUT,
- 'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
- 'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
+ 'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
'total_worker_count': orchestrator.get_total_worker_count(),
'workers': workers,
}
@@ -120,41 +96,12 @@ def get_orchestrator(request):
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
"""List all worker types and their current status."""
- from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
+ from archivebox.workers.worker import CrawlWorker
# Create temporary instances to query their queues
return [
CrawlWorker(worker_id=-1),
- SnapshotWorker(worker_id=-1),
- ArchiveResultWorker(worker_id=-1),
]
-@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
-def get_worker(request, worker_name: str):
- """Get status and queue for a specific worker type."""
- from archivebox.workers.worker import WORKER_TYPES
-
- if worker_name not in WORKER_TYPES:
- from ninja.errors import HttpError
- raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
-
- WorkerClass = WORKER_TYPES[worker_name]
- return WorkerClass(worker_id=-1)
-
-
-@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
-def get_worker_queue(request, worker_name: str, limit: int = 100):
- """Get the current queue for a specific worker type."""
- from archivebox.workers.worker import WORKER_TYPES
-
- if worker_name not in WORKER_TYPES:
- from ninja.errors import HttpError
- raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
-
- WorkerClass = WORKER_TYPES[worker_name]
- worker = WorkerClass(worker_id=-1)
- return list(worker.get_queue()[:limit])
-
-
# Progress endpoint moved to core.views.live_progress_view for simplicity
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 5043f3ed..25b0815b 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -96,10 +96,9 @@ def add(urls: str | list[str],
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
- # 3. The CrawlMachine will create the root Snapshot when started
- # If URLs are from a file: first URL = file:///path/to/sources/...txt
- # Parser extractors will run on it and discover more URLs
- # Those URLs become child Snapshots (depth=1)
+ # 3. The CrawlMachine will create Snapshots from all URLs when started
+ # Parser extractors run on snapshots and discover more URLs
+ # Discovered URLs become child Snapshots (depth+1)
if index_only:
# Just create the crawl but don't start processing
@@ -119,10 +118,9 @@ def add(urls: str | list[str],
# 5. Start the orchestrator to process the queue
# The orchestrator will:
- # - Process Crawl -> create root Snapshot
- # - Process root Snapshot -> run parser extractors -> discover URLs
- # - Create child Snapshots from discovered URLs
- # - Process child Snapshots -> run extractors
+ # - Process Crawl -> create Snapshots from all URLs
+ # - Process Snapshots -> run extractors
+ # - Parser extractors discover new URLs -> create child Snapshots
# - Repeat until max_depth reached
if bg:
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
index 5ef6c9ca..34b10faa 100755
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
-
+ (CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
+
from archivebox.config.common import STORAGE_CONFIG
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
+ (STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
if install:
from archivebox.cli.archivebox_install import install as install_method
diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py
index b168a480..04a8cba6 100644
--- a/archivebox/cli/archivebox_pluginmap.py
+++ b/archivebox/cli/archivebox_pluginmap.py
@@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
-│ │ QUEUED │◄────────────────┐ │
-│ │ (initial) │ │ │
-│ └──────┬──────┘ │ │
-│ │ │ tick() unless can_start() │
-│ │ tick() when │ │
-│ │ can_start() │ │
-│ ▼ │ │
-│ ┌─────────────┐ │ │
-│ │ STARTED │─────────────────┘ │
-│ │ │◄────────────────┐ │
-│ │ enter: │ │ tick() unless is_finished() │
-│ │ result.run()│─────────────────┘ │
-│ │ (execute │ │
-│ │ hook via │ │
-│ │ run_hook())│ │
-│ └──────┬──────┘ │
-│ │ │
-│ │ tick() checks status set by hook output │
-│ ├────────────────┬────────────────┬────────────────┐ │
-│ ▼ ▼ ▼ ▼ │
-│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
-│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │
-│ │ (final) │ │ (final) │ │ (final) │ │ │ │
-│ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ │
-│ │ │
-│ can_start()───┘ │
-│ loops back to STARTED │
+│ │ QUEUED │◄─────────────────┐ │
+│ │ (initial) │ │ │
+│ └──┬───────┬──┘ │ │
+│ │ │ │ tick() unless can_start() │
+│ │ │ exceeded_max_ │ │
+│ │ │ attempts │ │
+│ │ ▼ │ │
+│ │ ┌──────────┐ │ │
+│ │ │ SKIPPED │ │ │
+│ │ │ (final) │ │ │
+│ │ └──────────┘ │ │
+│ │ tick() when │ │
+│ │ can_start() │ │
+│ ▼ │ │
+│ ┌─────────────┐ │ │
+│ │ STARTED │──────────────────┘ │
+│ │ │◄─────────────────────────────────────────────────┐ │
+│ │ enter: │ │ │ │
+│ │ result.run()│ tick() unless │ │ │
+│ │ (execute │ is_finished() │ │ │
+│ │ hook via │──────────────────────┘ │ │
+│ │ run_hook())│ │ │
+│ └──────┬──────┘ │ │
+│ │ │ │
+│ │ tick() checks status set by hook output │ │
+│ ├─────────────┬─────────────┬─────────────┐ │ │
+│ ▼ ▼ ▼ ▼ │ │
+│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
+│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
+│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
+│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
+│ │ │ │ │
+│ exceeded_max_ │ │ can_start()│ │
+│ attempts │ │ loops back │ │
+│ ▼ │ └────────────┘ │
+│ ┌──────────┐ │ │
+│ │ SKIPPED │◄─┘ │
+│ │ (final) │ │
+│ └──────────┘ │
│ │
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
└─────────────────────────────────────────────────────────────────────────────┘
@@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
-│ │ │ tick() unless can_start() │
+│ │ │ tick() unless can_install() │
+│ │ │ (stays queued if failed) │
│ │ tick() when │ │
-│ │ can_start() │ │
-│ ▼ │ │
-│ ┌─────────────┐ │ │
-│ │ STARTED │─────────────────┘ │
-│ │ │◄────────────────┐ │
-│ │ enter: │ │ │
-│ │ binary.run()│ │ tick() unless is_finished() │
-│ │ (discover │─────────────────┘ │
-│ │ Binary │ │
-│ │ hooks, │ │
-│ │ try each │ │
-│ │ provider) │ │
-│ └──────┬──────┘ │
+│ │ can_install() │ │
+│ │ │ │
+│ │ on_install() runs │ │
+│ │ during transition: │ │
+│ │ • binary.run() │ │
+│ │ (discover Binary │ │
+│ │ hooks, try each │ │
+│ │ provider until │ │
+│ │ one succeeds) │ │
+│ │ • Sets abspath, │ │
+│ │ version, sha256 │ │
+│ │ │ │
+│ │ If install fails: │ │
+│ │ raises exception──────┘ │
+│ │ (retry_at bumped) │
│ │ │
-│ │ tick() checks status set by hook output │
-│ ├────────────────────────────────┐ │
-│ ▼ ▼ │
-│ ┌─────────────┐ ┌─────────────┐ │
-│ │ SUCCEEDED │ │ FAILED │ │
-│ │ (final) │ │ (final) │ │
-│ │ │ │ │ │
-│ │ abspath, │ │ no provider │ │
-│ │ version set │ │ succeeded │ │
-│ └─────────────┘ └─────────────┘ │
+│ ▼ │
+│ ┌─────────────┐ │
+│ │ INSTALLED │ │
+│ │ (final) │ │
+│ │ │ │
+│ │ Binary is │ │
+│ │ ready to │ │
+│ │ use │ │
+│ └─────────────┘ │
│ │
-│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter) │
+│ Hooks triggered: on_Binary__* (provider hooks during transition) │
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
+│ Installation is synchronous - no intermediate STARTED state │
└─────────────────────────────────────────────────────────────────────────────┘
"""
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index 2fbd05c0..f780a289 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
if not archive_dir.exists():
return stats
- print('[*] Scanning for old directories in archive/...')
+ print('[DEBUG Phase1] Scanning for old directories in archive/...')
# Scan for real directories only (skip symlinks - they're already migrated)
+ all_entries = list(os.scandir(archive_dir))
+ print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
entries = [
(e.stat().st_mtime, e.path)
- for e in os.scandir(archive_dir)
+ for e in all_entries
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
+ print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
print(f'[*] Found {len(entries)} old directories to drain')
for mtime, entry_path in entries:
@@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
+ # Ensure snapshot has a valid crawl (migration 0024 may have failed)
+ from archivebox.crawls.models import Crawl
+ has_valid_crawl = False
+ if snapshot.crawl_id:
+ # Check if the crawl actually exists
+ has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists()
+
+ if not has_valid_crawl:
+ # Create a new crawl (created_by will default to system user)
+ crawl = Crawl.objects.create(urls=snapshot.url)
+ # Use queryset update to avoid triggering save() hooks
+ from archivebox.core.models import Snapshot as SnapshotModel
+ SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
+ # Refresh the instance
+ snapshot.crawl = crawl
+ snapshot.crawl_id = crawl.id
+ print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
+
# Check if needs migration (0.8.x → 0.9.x)
+ print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
if snapshot.fs_migration_needed:
try:
- # Manually trigger filesystem migration without full save()
- # This avoids UNIQUE constraint issues while still migrating files
- cleanup_info = None
- if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
- cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
+ # Calculate paths using actual directory (entry_path), not snapshot.timestamp
+ # because snapshot.timestamp might be truncated
+ old_dir = entry_path
+ new_dir = snapshot.get_storage_path_for_version('0.9.0')
+ print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")
+
+ # Manually migrate files
+ if not new_dir.exists() and old_dir.exists():
+ new_dir.mkdir(parents=True, exist_ok=True)
+ import shutil
+ file_count = 0
+ for old_file in old_dir.rglob('*'):
+ if old_file.is_file():
+ rel_path = old_file.relative_to(old_dir)
+ new_file = new_dir / rel_path
+ if not new_file.exists():
+ new_file.parent.mkdir(parents=True, exist_ok=True)
+ shutil.copy2(old_file, new_file)
+ file_count += 1
+ print(f"[DEBUG Phase1] Copied {file_count} files")
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
@@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
# Commit the transaction
transaction.commit()
- # Manually call cleanup since we bypassed normal save() flow
- if cleanup_info:
- old_dir, new_dir = cleanup_info
+ # Cleanup: delete old dir and create symlink
+ if old_dir.exists() and old_dir != new_dir:
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
@@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
continue
try:
- # Reconcile index.json with DB
- snapshot.reconcile_with_index_json()
+ print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
+
+ # Check if snapshot has a directory on disk
+ from pathlib import Path
+ output_dir = Path(snapshot.output_dir)
+ has_directory = output_dir.exists() and output_dir.is_dir()
+
+ # Only reconcile if directory exists (don't create empty directories for orphans)
+ if has_directory:
+ snapshot.reconcile_with_index_json()
# Clean up invalid field values from old migrations
if not isinstance(snapshot.current_step, int):
snapshot.current_step = 0
+ # If still needs migration, it's an orphan (no directory on disk)
+ # Mark it as migrated to prevent save() from triggering filesystem migration
+ if snapshot.fs_migration_needed:
+ if has_directory:
+ print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
+ else:
+ print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
+ # Use queryset update to set fs_version without triggering save() hooks
+ from archivebox.core.models import Snapshot as SnapshotModel
+ SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
+ snapshot.fs_version = '0.9.0'
+
# Queue for archiving (state machine will handle it)
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()
- stats['reconciled'] += 1
+ stats['reconciled'] += 1 if has_directory else 0
stats['queued'] += 1
except Exception as e:
# Skip snapshots that can't be processed (e.g., missing crawl)
diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
index 5b0666c5..8ac9d889 100644
--- a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
+++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py
@@ -4,6 +4,7 @@
from django.db import migrations, connection
import json
from pathlib import Path
+from archivebox.uuid_compat import uuid7
def parse_cmd_field(cmd_raw):
@@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw):
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
- import uuid
import socket
from datetime import datetime
@@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor):
return row[0]
# Create new machine
- machine_id = str(uuid.uuid4())
+ # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
+ machine_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
Returns:
binary_id (str)
"""
- import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
@@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
return row[0]
# Create new binary
- binary_id = str(uuid.uuid4())
+ # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
+ binary_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
Returns:
process_id (str)
"""
- import uuid
from datetime import datetime
- process_id = str(uuid.uuid4())
+ # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
+ process_id = uuid7().hex
now = datetime.now().isoformat()
# Convert cmd array to JSON
diff --git a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py
new file mode 100644
index 00000000..eb86883d
--- /dev/null
+++ b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py
@@ -0,0 +1,18 @@
+# Generated by Django 6.0 on 2026-01-02 08:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0027_copy_archiveresult_to_process'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='snapshot',
+ name='fs_version',
+ field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
+ ),
+ ]
diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
new file mode 100644
index 00000000..36b9f14c
--- /dev/null
+++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
@@ -0,0 +1,181 @@
+# Generated by hand on 2026-01-02
+# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot)
+
+from django.db import migrations, models, connection
+from uuid import UUID
+from archivebox.uuid_compat import uuid7
+
+
+def migrate_archiveresult_id_to_uuid(apps, schema_editor):
+ """
+ Migrate ArchiveResult from integer PK to UUID PK.
+
+ Strategy:
+ 1. Add old_id field to store current integer IDs
+ 2. Generate UUIDs for any records missing them
+ 3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
+ """
+ cursor = connection.cursor()
+
+ # Check if table exists and has data
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
+ if not cursor.fetchone():
+ print('ArchiveResult table does not exist, skipping migration')
+ return
+
+ cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
+ row_count = cursor.fetchone()[0]
+
+ if row_count == 0:
+ print('No ArchiveResult records to migrate')
+ return
+
+ print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
+
+ # Step 0: Check if machine_process table exists, if not NULL out process_id values
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
+ machine_process_exists = cursor.fetchone() is not None
+
+ if not machine_process_exists:
+ print('machine_process table does not exist yet, setting process_id to NULL')
+ cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
+
+ # Step 1: Create new table with UUID as primary key
+ cursor.execute("""
+ CREATE TABLE core_archiveresult_new (
+ id TEXT PRIMARY KEY NOT NULL,
+ old_id INTEGER,
+ uuid TEXT UNIQUE,
+ created_at DATETIME NOT NULL,
+ modified_at DATETIME NOT NULL,
+
+ snapshot_id TEXT NOT NULL,
+ plugin VARCHAR(32) NOT NULL,
+ hook_name VARCHAR(255) NOT NULL DEFAULT '',
+
+ status VARCHAR(15) NOT NULL DEFAULT 'queued',
+ retry_at DATETIME,
+
+ start_ts DATETIME,
+ end_ts DATETIME,
+
+ output_str TEXT NOT NULL DEFAULT '',
+ output_json TEXT,
+ output_files TEXT NOT NULL DEFAULT '{}',
+ output_size BIGINT NOT NULL DEFAULT 0,
+ output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
+
+ config TEXT NOT NULL DEFAULT '{}',
+ notes TEXT NOT NULL DEFAULT '',
+ num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
+ num_uses_failed INTEGER NOT NULL DEFAULT 0,
+
+ process_id TEXT,
+
+ FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
+ FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL
+ );
+ """)
+
+ # Step 2: Generate UUIDs for records that don't have them
+ cursor.execute("SELECT id, uuid FROM core_archiveresult")
+ records = cursor.fetchall()
+
+ id_to_uuid = {}
+ for old_id, existing_uuid in records:
+ if existing_uuid:
+ # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
+ # (existing UUIDs might be stored with or without dashes in old schema)
+ id_to_uuid[old_id] = UUID(existing_uuid).hex
+ else:
+ # Generate new UUIDv7 (time-ordered) as 32-char hex
+ id_to_uuid[old_id] = uuid7().hex
+
+ # Step 3: Copy data with UUIDs as new primary key
+ cursor.execute("SELECT * FROM core_archiveresult")
+ old_records = cursor.fetchall()
+
+ # Get column names
+ cursor.execute("PRAGMA table_info(core_archiveresult)")
+ columns = cursor.fetchall()
+ col_names = [col[1] for col in columns]
+
+ for i, record in enumerate(old_records):
+ old_id = record[col_names.index('id')]
+ new_uuid = id_to_uuid[old_id]
+
+ # Build insert with new structure
+ values = {col_names[i]: record[i] for i in range(len(col_names))}
+
+ # Check which fields exist in new table
+ fields_to_copy = [
+ 'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
+ 'status', 'retry_at', 'start_ts', 'end_ts',
+ 'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes',
+ 'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
+ ]
+
+ # Build INSERT statement
+ existing_fields = [f for f in fields_to_copy if f in values]
+ placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
+ field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
+
+ insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
+
+ cursor.execute(
+ f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
+ insert_values
+ )
+
+ # Step 4: Replace old table with new table
+ cursor.execute("DROP TABLE core_archiveresult")
+ cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
+
+ # Step 5: Create indexes
+ cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
+ cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
+ cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
+ cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
+ cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
+ cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
+ cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
+ cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
+
+ print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0028_alter_snapshot_fs_version'),
+ ]
+
+ operations = [
+ migrations.SeparateDatabaseAndState(
+ database_operations=[
+ migrations.RunPython(
+ migrate_archiveresult_id_to_uuid,
+ reverse_code=migrations.RunPython.noop,
+ ),
+ ],
+ state_operations=[
+ # Remove old uuid field
+ migrations.RemoveField(
+ model_name='archiveresult',
+ name='uuid',
+ ),
+ # Change id from AutoField to UUIDField
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='id',
+ field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
+ ),
+ # Add old_id field to preserve legacy integer IDs
+ migrations.AddField(
+ model_name='archiveresult',
+ name='old_id',
+ field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
+ ),
+ ],
+ ),
+ ]
diff --git a/archivebox/core/migrations/0030_alter_archiveresult_id.py b/archivebox/core/migrations/0030_alter_archiveresult_id.py
new file mode 100644
index 00000000..0c5e54b0
--- /dev/null
+++ b/archivebox/core/migrations/0030_alter_archiveresult_id.py
@@ -0,0 +1,19 @@
+# Generated by Django 6.0 on 2026-01-02 10:02
+
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0029_migrate_archiveresult_to_uuid_pk'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='id',
+ field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 3a21041a..3de5b4f8 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
+ print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}")
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
@@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Only run if method exists (most are no-ops)
if hasattr(self, method):
+ print(f"[DEBUG save()] Running {method}()")
getattr(self, method)()
current = next_ver
@@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
old_dir = self.get_storage_path_for_version('0.8.0')
new_dir = self.get_storage_path_for_version('0.9.0')
- if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
- # Even if no directory migration needed, still convert index format
- self.convert_index_json_to_jsonl()
- return
+ print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}")
+
+ if not old_dir.exists() or old_dir == new_dir:
+ # No migration needed
+ print(f"[DEBUG _fs_migrate] Returning None (early return)")
+ return None
+
+ if new_dir.exists():
+ # New directory already exists (files already copied), but we still need cleanup
+ # Return cleanup info so old directory can be cleaned up
+ print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
+ return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory and create symlink after successful migration.
- Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
- print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
-
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
- print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
- print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
- # Log but don't raise - migration succeeded, this is just cleanup
- print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
- else:
- print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
- print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
- print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
- print(f"[DEBUG] Successfully created symlink")
except Exception as e:
- print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
- else:
- print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not timestamp:
return None
- # Look up existing
+ # Look up existing (try exact match first, then fuzzy match for truncated timestamps)
try:
- return cls.objects.get(url=url, timestamp=timestamp)
+ snapshot = cls.objects.get(url=url, timestamp=timestamp)
+ print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
+ return snapshot
except cls.DoesNotExist:
+ print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
+ # Try fuzzy match - index.json may have truncated timestamp
+ # e.g., index has "1767000340" but DB has "1767000340.624737"
+ candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
+ if candidates.count() == 1:
+ snapshot = candidates.first()
+ print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
+ return snapshot
+ elif candidates.count() > 1:
+ print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
+ return candidates.first()
+ print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
except cls.MultipleObjectsReturned:
# Should not happen with unique constraint
+ print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
return cls.objects.filter(url=url, timestamp=timestamp).first()
@classmethod
@@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return archiveresults
- def advance_step_if_ready(self) -> bool:
- """
- Advance current_step if all foreground hooks in current step are finished.
-
- Called by the state machine to check if step can advance.
- Background hooks (.bg) don't block step advancement.
-
- Step advancement rules:
- - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
- - Background ARs (hook_name contains '.bg.') are ignored for advancement
- - When ready, increments current_step by 1 (up to 9)
-
- Returns:
- True if step was advanced, False if not ready or already at step 9.
- """
- from archivebox.hooks import extract_step, is_background_hook
-
- if self.current_step >= 9:
- return False # Already at final step
-
- # Get all ARs for current step that are foreground
- current_step_ars = self.archiveresult_set.filter(
- hook_name__isnull=False
- ).exclude(hook_name='')
-
- # Check each AR in current step
- for ar in current_step_ars:
- ar_step = extract_step(ar.hook_name)
- if ar_step != self.current_step:
- continue # Not in current step
-
- if is_background_hook(ar.hook_name):
- continue # Background hooks don't block
-
- # Foreground hook in current step - check if finished
- if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
- # Still pending/queued - can't advance
- return False
-
- if ar.status == ArchiveResult.StatusChoices.STARTED:
- # Still running - can't advance
- return False
-
- # All foreground hooks in current step are finished - advance!
- self.current_step += 1
- self.save(update_fields=['current_step', 'modified_at'])
- return True
def is_finished_processing(self) -> bool:
"""
- Check if this snapshot has finished processing.
+ Check if all ArchiveResults are finished.
- Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
-
- Returns:
- True if all archiveresults are finished (or no work to do), False otherwise.
+ Note: This is only called for observability/progress tracking.
+ SnapshotWorker owns the execution and doesn't poll this.
"""
- # if no archiveresults exist yet, it's not finished
- if not self.archiveresult_set.exists():
- return False
+ # Check if any ARs are still pending/started
+ pending = self.archiveresult_set.exclude(
+ status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
+ ).exists()
- # Try to advance step if ready (handles step-based hook execution)
- # This will increment current_step when all foreground hooks in current step are done
- while self.advance_step_if_ready():
- pass # Keep advancing until we can't anymore
-
- # if archiveresults exist but are still pending, it's not finished
- if self.pending_archiveresults().exists():
- return False
-
- # Don't wait for background hooks - they'll be cleaned up on entering sealed state
- # Background hooks in STARTED state are excluded by pending_archiveresults()
- # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
- # we can transition to sealed and cleanup() will kill the background hooks
-
- # otherwise archiveresults exist and are all finished, so it's finished
- return True
+ return not pending
def get_progress_stats(self) -> dict:
"""
@@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
- started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
@@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
can_start = bool(self.snapshot.url)
return can_start
+ def is_finished(self) -> bool:
+ """Check if all ArchiveResults for this snapshot are finished."""
+ return self.snapshot.is_finished_processing()
+
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
- import sys
-
- print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
-
- # Run the snapshot - creates pending archiveresults for all enabled plugins
- self.snapshot.run()
-
- # Check if any archiveresults were created
- ar_count = self.snapshot.archiveresult_set.count()
- print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
-
- if ar_count == 0:
- # No archiveresults created, seal immediately
- print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
- self.seal()
- else:
- # Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
- # Last AR will manually call self.seal() when done
- self.snapshot.update_and_requeue(
- retry_at=timezone.now() + timedelta(days=365),
- status=Snapshot.StatusChoices.STARTED,
- )
- print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
+ """Just mark as started - SnapshotWorker will create ARs and run hooks."""
+ self.snapshot.status = Snapshot.StatusChoices.STARTED
+ self.snapshot.retry_at = None # No more polling
+ self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@sealed.enter
def enter_sealed(self):
@@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugins = [get_plugin_name(e) for e in get_plugins()]
return tuple((e, e) for e in plugins)
- # Keep AutoField for backward compatibility with 0.7.x databases
- # UUID field is added separately by migration for new records
- id = models.AutoField(primary_key=True, editable=False)
- # Note: unique constraint is added by migration 0027 - don't set unique=True here
- # or SQLite table recreation in earlier migrations will fail
- uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
+ # UUID primary key (migrated from integer in 0029)
+ id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
+ # old_id preserves the legacy integer ID for backward compatibility
+ old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
+ # Note: uuid field was removed in migration 0029 when id became UUID
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
start_ts = timezone.now()
- is_bg_hook = False
+ process = None
for hook in hooks:
- # Check if this is a background hook
- is_bg_hook = is_background_hook(hook.name)
-
- result = run_hook(
+ # Run hook using Process.launch() - returns Process model
+ process = run_hook(
hook,
output_dir=plugin_dir,
config=config,
@@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
depth=self.snapshot.depth,
)
- # Background hooks return None
- if result is None:
- is_bg_hook = True
-
- # Update status based on hook execution
- if is_bg_hook:
- # BACKGROUND HOOK - still running, return immediately
- # Status stays STARTED, will be finalized by Snapshot.cleanup()
- self.status = self.StatusChoices.STARTED
+ # Link ArchiveResult to Process
+ self.process = process
self.start_ts = start_ts
- if self.process_id:
- self.process.pwd = str(plugin_dir)
- self.process.save()
+ self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
+
+ if not process:
+ # No hooks ran
+ self.status = self.StatusChoices.FAILED
+ self.output_str = 'No hooks executed'
self.save()
return
+ # Update status based on hook execution
+ if process.status == process.StatusChoices.RUNNING:
+ # BACKGROUND HOOK - still running, return immediately
+ # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
+ return
+
# FOREGROUND HOOK - completed, update from filesystem
- self.start_ts = start_ts
- if self.process_id:
- self.process.pwd = str(plugin_dir)
- self.process.save()
self.update_from_output()
# Clean up empty output directory if no files were created
@@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
+ # Flow: queued → started → (succeeded|failed|skipped)
+ # queued → skipped (if exceeded max attempts)
+ # started → backoff → started (retry)
tick = (
+ queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
- started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
+ backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
backoff.to.itself(unless='can_start') |
- backoff.to(started, cond='can_start') |
- backoff.to(succeeded, cond='is_succeeded') |
- backoff.to(failed, cond='is_failed') |
- backoff.to(skipped, cond='is_skipped')
+ backoff.to(started, cond='can_start')
+ # Removed redundant transitions: backoff.to(succeeded/failed/skipped)
+ # Reason: backoff should always retry→started, then started→final states
)
def can_start(self) -> bool:
- if not self.archiveresult.snapshot.url:
- return False
+ """Pure function - check if AR can start (has valid URL)."""
+ return bool(self.archiveresult.snapshot.url)
- # Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
+ def is_exceeded_max_attempts(self) -> bool:
+ """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
from archivebox.config.configset import get_config
config = get_config(
@@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
status=ArchiveResult.StatusChoices.FAILED
).count()
- if failed_count >= max_attempts:
- # Mark this result as skipped since we've hit the limit
- self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
- self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
- self.archiveresult.retry_at = None
- self.archiveresult.save()
- return False
-
- return True
+ return failed_count >= max_attempts
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
@@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def is_finished(self) -> bool:
- """Check if extraction has completed (success, failure, or skipped)."""
- return self.archiveresult.status in (
+ """
+ Check if extraction has completed (success, failure, or skipped).
+
+ For background hooks in STARTED state, checks if their Process has finished and reaps them.
+ """
+ # If already in final state, return True
+ if self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
- )
+ ):
+ return True
+
+ # If in STARTED state with a Process, check if Process has finished running
+ if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
+ if self.archiveresult.process_id:
+ process = self.archiveresult.process
+
+ # If process is NOT running anymore, reap the background hook
+ if not process.is_running():
+ self.archiveresult.update_from_output()
+ # Check if now in final state after reaping
+ return self.archiveresult.status in (
+ ArchiveResult.StatusChoices.SUCCEEDED,
+ ArchiveResult.StatusChoices.FAILED,
+ ArchiveResult.StatusChoices.SKIPPED,
+ )
+
+ return False
@queued.enter
def enter_queued(self):
@@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def _check_and_seal_parent_snapshot(self):
- """Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
+ """
+ Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
+
+ Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
+ This method is kept for backwards compatibility with manual CLI commands.
+ """
import sys
snapshot = self.archiveresult.snapshot
@@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_failed(self):
import sys
+ print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
+
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_skipped(self):
import sys
+ # Set output_str if not already set (e.g., when skipped due to max attempts)
+ if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
+ from archivebox.config.configset import get_config
+ config = get_config(
+ crawl=self.archiveresult.snapshot.crawl,
+ snapshot=self.archiveresult.snapshot,
+ )
+ max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
+ self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
+
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index da08b0ac..110fe941 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -281,25 +281,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
"""Editor for crawl URLs."""
widget_id = f'crawl_urls_{obj.pk}'
- # Check if it's a local file we can edit
- source_file = obj.get_file_path()
- is_file = source_file is not None
- file_contents = ""
- error = None
-
- if is_file and source_file:
- try:
- file_contents = source_file.read_text().strip()
- except Exception as e:
- error = f'Error reading {source_file}: {e}'
-
# Escape for safe HTML embedding
escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
- escaped_file_contents = file_contents.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
# Count lines for auto-expand logic
line_count = len((obj.urls or '').split('\n'))
- file_line_count = len(file_contents.split('\n')) if file_contents else 0
uri_rows = min(max(3, line_count), 10)
html = f'''
@@ -318,21 +304,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
{line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
-
- {"" if not is_file else f'''
-
-
-
- {"
" + error + "
" if error else ""}
-
-
- '''}
-
'''
return mark_safe(html)
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 52ed6c81..d8df425c 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -114,22 +114,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
},
)
- @classmethod
- def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
- tags_str: str = '', config=None, created_by=None):
- """Create a crawl from a file containing URLs."""
- urls_content = source_file.read_text()
- crawl = cls.objects.create(
- urls=urls_content,
- extractor=extractor,
- max_depth=max_depth,
- tags_str=tags_str,
- label=label or source_file.name,
- config=config or {},
- created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
- )
- return crawl
-
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
@@ -196,15 +180,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return crawl
@property
- def output_dir_parent(self) -> str:
- """Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
- date_str = self.created_at.strftime('%Y%m%d')
- return f'users/{self.created_by.username}/crawls/{date_str}'
+ def OUTPUT_DIR(self) -> Path:
+ """
+ Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
+ Domain is extracted from the first URL in the crawl.
+ """
+ from archivebox import DATA_DIR
+ from archivebox.core.models import Snapshot
- @property
- def output_dir_name(self) -> str:
- """Use crawl ID as directory name"""
- return str(self.id)
+ date_str = self.created_at.strftime('%Y%m%d')
+ urls = self.get_urls_list()
+ domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown'
+
+ return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id)
def get_urls_list(self) -> list[str]:
"""Get list of URLs from urls field, filtering out comments and empty lines."""
@@ -216,52 +204,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
- def get_file_path(self) -> Path | None:
- """
- Get filesystem path if this crawl references a local file.
- Checks if the first URL is a file:// URI.
- """
- urls = self.get_urls_list()
- if not urls:
- return None
-
- first_url = urls[0]
- if not first_url.startswith('file://'):
- return None
-
- # Remove file:// prefix
- path_str = first_url.replace('file://', '', 1)
- return Path(path_str)
-
- def create_root_snapshot(self) -> 'Snapshot':
- from archivebox.core.models import Snapshot
-
- first_url = self.get_urls_list()[0] if self.get_urls_list() else None
- if not first_url:
- raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
-
- # Try to get existing snapshot
- try:
- snapshot = Snapshot.objects.get(crawl=self, url=first_url)
- # If exists and already queued/started, return it as-is
- if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
- # Update retry_at to now so it can be picked up immediately
- snapshot.retry_at = timezone.now()
- snapshot.save(update_fields=['retry_at'])
- return snapshot
- except Snapshot.DoesNotExist:
- pass
-
- # Create new snapshot
- root_snapshot = Snapshot.objects.create(
- crawl=self,
- url=first_url,
- status=Snapshot.INITIAL_STATE,
- retry_at=timezone.now(),
- timestamp=str(timezone.now().timestamp()),
- depth=0,
- )
- return root_snapshot
def add_url(self, entry: dict) -> bool:
"""
@@ -316,11 +258,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
List of newly created Snapshot objects
"""
+ import sys
import json
from archivebox.core.models import Snapshot
created_snapshots = []
+ print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
+ print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
+
for line in self.urls.splitlines():
if not line.strip():
continue
@@ -329,13 +275,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
try:
entry = json.loads(line)
url = entry.get('url', '')
- depth = entry.get('depth', 1)
+ depth = entry.get('depth', 0)
title = entry.get('title')
timestamp = entry.get('timestamp')
tags = entry.get('tags', '')
except json.JSONDecodeError:
url = line.strip()
- depth = 1
+ depth = 0
title = None
timestamp = None
tags = ''
@@ -379,41 +325,90 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
"""
import time
+ import json
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.config.configset import get_config
+ # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
+ debug_log = Path('/tmp/archivebox_crawl_debug.log')
+ with open(debug_log, 'a') as f:
+ f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n')
+ f.flush()
+
# Get merged config with crawl context
config = get_config(crawl=self)
+ # Load all binaries.jsonl files from plugins
+ # This replaces individual on_Crawl install hooks with declarative configuration
+ from archivebox.hooks import BUILTIN_PLUGINS_DIR
+ from archivebox.machine.models import Machine
+
+ machine_id = str(Machine.current().id)
+ binaries_records = []
+
+ for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'):
+ try:
+ with open(binaries_file, 'r') as f:
+ for line in f:
+ line = line.strip()
+ if line and not line.startswith('#'):
+ try:
+ record = json.loads(line)
+ if record.get('type') == 'Binary':
+ record['machine_id'] = machine_id
+ binaries_records.append(record)
+ except json.JSONDecodeError:
+ pass
+ except Exception:
+ pass
+
+ # Process binary declarations before running hooks
+ if binaries_records:
+ overrides = {'crawl': self}
+ process_hook_records(binaries_records, overrides=overrides)
+
# Discover and run on_Crawl hooks
+ with open(debug_log, 'a') as f:
+ f.write(f'Discovering Crawl hooks...\n')
+ f.flush()
hooks = discover_hooks('Crawl', config=config)
- first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+ with open(debug_log, 'a') as f:
+ f.write(f'Found {len(hooks)} hooks\n')
+ f.flush()
for hook in hooks:
+ with open(debug_log, 'a') as f:
+ f.write(f'Running hook: {hook.name}\n')
+ f.flush()
hook_start = time.time()
plugin_name = hook.parent.name
output_dir = self.OUTPUT_DIR / plugin_name
output_dir.mkdir(parents=True, exist_ok=True)
- result = run_hook(
+ # Run hook using Process.launch() - returns Process model
+ process = run_hook(
hook,
output_dir=output_dir,
config=config,
crawl_id=str(self.id),
- source_url=first_url,
+ source_url=self.urls, # Pass full newline-separated URLs
)
+ with open(debug_log, 'a') as f:
+ f.write(f'Hook {hook.name} completed with status={process.status}\n')
+ f.flush()
hook_elapsed = time.time() - hook_start
if hook_elapsed > 0.5: # Log slow hooks
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
- # Background hook - returns None, continues running
- if result is None:
+ # Background hook - still running
+ if process.status == process.StatusChoices.RUNNING:
continue
# Foreground hook - process JSONL records
- records = result.get('records', [])
+ from archivebox.hooks import extract_records_from_process
+ records = extract_records_from_process(process)
if records:
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
for record in records[:3]: # Show first 3
@@ -423,14 +418,33 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if stats:
print(f'[green]✓ Created: {stats}[/green]')
- # System crawls (archivebox://*) don't create snapshots - they just run hooks
- if first_url.startswith('archivebox://'):
- return None
+ # Create snapshots from all URLs in self.urls
+ with open(debug_log, 'a') as f:
+ f.write(f'Creating snapshots from URLs...\n')
+ f.flush()
+ created_snapshots = self.create_snapshots_from_urls()
+ with open(debug_log, 'a') as f:
+ f.write(f'Created {len(created_snapshots)} snapshots\n')
+ f.write(f'=== Crawl.run() complete ===\n\n')
+ f.flush()
+ return created_snapshots[0] if created_snapshots else None
- # Create snapshots from URLs
- root_snapshot = self.create_root_snapshot()
- self.create_snapshots_from_urls()
- return root_snapshot
+ def is_finished(self) -> bool:
+ """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
+ from archivebox.core.models import Snapshot
+
+ # Check if any snapshots exist for this crawl
+ snapshots = Snapshot.objects.filter(crawl=self)
+
+ # If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
+ if not snapshots.exists():
+ return True
+
+ # If snapshots exist, check if all are sealed
+ if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
+ return False
+
+ return True
def cleanup(self):
"""Clean up background hooks and run on_CrawlEnd hooks."""
@@ -452,7 +466,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
config = get_config(crawl=self)
hooks = discover_hooks('CrawlEnd', config=config)
- first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
plugin_name = hook.parent.name
@@ -464,7 +477,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
output_dir=output_dir,
config=config,
crawl_id=str(self.id),
- source_url=first_url,
+ source_url=self.urls, # Pass full newline-separated URLs
)
# Log failures but don't block
@@ -494,7 +507,6 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
│ - run_hook(script, output_dir, ...) │
│ - Parse JSONL from hook output │
│ - process_hook_records() → creates Snapshots │
- │ • create_root_snapshot() → root snapshot for crawl │
│ • create_snapshots_from_urls() → from self.urls field │
│ │
│ 2. Snapshots process independently with their own │
@@ -518,7 +530,8 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start')
+ queued.to(started, cond='can_start') |
+ started.to(sealed, cond='is_finished')
)
# Manual event (triggered by last Snapshot sealing)
@@ -534,6 +547,10 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
return False
return True
+ def is_finished(self) -> bool:
+ """Check if all Snapshots for this crawl are finished."""
+ return self.crawl.is_finished()
+
@started.enter
def enter_started(self):
import sys
@@ -543,25 +560,21 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
try:
# Run the crawl - runs hooks, processes JSONL, creates snapshots
- root_snapshot = self.crawl.run()
+ first_snapshot = self.crawl.run()
- if root_snapshot:
- print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
+ if first_snapshot:
+ print(f'[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr)
# Update status to STARTED
- # Set retry_at to None so workers don't claim us (we wait for snapshots to finish)
- # Last snapshot will manually call self.seal() when done
+ # Set retry_at to near future so tick() can poll and check is_finished()
self.crawl.update_and_requeue(
- retry_at=None,
+ retry_at=timezone.now() + timedelta(seconds=2),
status=Crawl.StatusChoices.STARTED,
)
else:
# No snapshots (system crawl like archivebox://install)
- print(f'[cyan]🔄 No snapshots created, allowing immediate seal[/cyan]', file=sys.stderr)
- # Set retry_at=now so next tick() will transition to sealed
- self.crawl.update_and_requeue(
- retry_at=timezone.now(),
- status=Crawl.StatusChoices.STARTED,
- )
+ print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
+ # Seal immediately since there's no work to do
+ self.seal()
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index b21022dc..0f69ad77 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -240,13 +240,14 @@ def run_hook(
output_dir: Path,
config: Dict[str, Any],
timeout: Optional[int] = None,
+ parent: Optional['Process'] = None,
**kwargs: Any
-) -> HookResult:
+) -> 'Process':
"""
- Execute a hook script with the given arguments.
+ Execute a hook script with the given arguments using Process model.
- This is the low-level hook executor. For running extractors with proper
- metadata handling, use call_extractor() instead.
+ This is the low-level hook executor that creates a Process record and
+ uses Process.launch() for subprocess management.
Config is passed to hooks via environment variables. Caller MUST use
get_config() to merge all sources (file, env, machine, crawl, snapshot).
@@ -257,16 +258,20 @@ def run_hook(
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time in seconds
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
+ parent: Optional parent Process (for tracking worker->hook hierarchy)
**kwargs: Arguments passed to the script as --key=value
Returns:
- HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
+ Process model instance (use process.exit_code, process.stdout, process.get_records())
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
- result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
+ process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
+ if process.status == 'exited':
+ records = process.get_records() # Get parsed JSONL output
"""
+ from archivebox.machine.models import Process, Machine
import time
start_time = time.time()
@@ -276,18 +281,32 @@ def run_hook(
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
+ # Get current machine
+ machine = Machine.current()
+
+ # Auto-detect parent process if not explicitly provided
+ # This enables automatic hierarchy tracking: Worker -> Hook
+ if parent is None:
+ try:
+ parent = Process.current()
+ except Exception:
+ # If Process.current() fails (e.g., not in a worker context), leave parent as None
+ pass
+
if not script.exists():
- return HookResult(
- returncode=1,
- stdout='',
+ # Create a failed Process record for hooks that don't exist
+ process = Process.objects.create(
+ machine=machine,
+ parent=parent,
+ process_type=Process.TypeChoices.HOOK,
+ pwd=str(output_dir),
+ cmd=['echo', f'Hook script not found: {script}'],
+ timeout=timeout,
+ status=Process.StatusChoices.EXITED,
+ exit_code=1,
stderr=f'Hook script not found: {script}',
- output_json=None,
- output_files=[],
- duration_ms=0,
- hook=str(script),
- plugin=script.parent.name,
- hook_name=script.name,
)
+ return process
# Determine the interpreter based on file extension
ext = script.suffix.lower()
@@ -379,130 +398,138 @@ def run_hook(
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)
- # Capture files before execution to detect new output
- files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
-
# Detect if this is a background hook (long-running daemon)
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
# Old convention: __background in stem (for backwards compatibility)
is_background = '.bg.' in script.name or '__background' in script.stem
- # Set up output files for ALL hooks (useful for debugging)
- stdout_file = output_dir / 'stdout.log'
- stderr_file = output_dir / 'stderr.log'
- pid_file = output_dir / 'hook.pid'
- cmd_file = output_dir / 'cmd.sh'
-
try:
- # Write command script for validation
- from archivebox.misc.process_utils import write_cmd_file
- write_cmd_file(cmd_file, cmd)
-
- # Open log files for writing
- with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
- process = subprocess.Popen(
- cmd,
- cwd=str(output_dir),
- stdout=out,
- stderr=err,
- env=env,
- )
-
- # Write PID with mtime set to process start time for validation
- from archivebox.misc.process_utils import write_pid_file_with_mtime
- process_start_time = time.time()
- write_pid_file_with_mtime(pid_file, process.pid, process_start_time)
-
- if is_background:
- # Background hook - return None immediately, don't wait
- # Process continues running, writing to stdout.log
- # ArchiveResult will poll for completion later
- return None
-
- # Normal hook - wait for completion with timeout
- try:
- returncode = process.wait(timeout=timeout)
- except subprocess.TimeoutExpired:
- process.kill()
- process.wait() # Clean up zombie
- duration_ms = int((time.time() - start_time) * 1000)
- return HookResult(
- returncode=-1,
- stdout='',
- stderr=f'Hook timed out after {timeout} seconds',
- output_json=None,
- output_files=[],
- duration_ms=duration_ms,
- hook=str(script),
- )
-
- # Read output from files
- stdout = stdout_file.read_text() if stdout_file.exists() else ''
- stderr = stderr_file.read_text() if stderr_file.exists() else ''
-
- # Detect new files created by the hook
- files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
- new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
- # Exclude the log files themselves from new_files
- new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
-
- # Parse JSONL output from stdout
- # Each line starting with { that has 'type' field is a record
- records = []
- plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
- hook_name = script.name # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
-
- for line in stdout.splitlines():
- line = line.strip()
- if not line or not line.startswith('{'):
- continue
-
- try:
- data = json.loads(line)
- if 'type' in data:
- # Add plugin metadata to every record
- data['plugin'] = plugin_name
- data['hook_name'] = hook_name
- data['plugin_hook'] = str(script)
- records.append(data)
- except json.JSONDecodeError:
- pass
-
- duration_ms = int((time.time() - start_time) * 1000)
-
- # Clean up log files on success (keep on failure for debugging)
- if returncode == 0:
- stdout_file.unlink(missing_ok=True)
- stderr_file.unlink(missing_ok=True)
- pid_file.unlink(missing_ok=True)
-
- return HookResult(
- returncode=returncode,
- stdout=stdout,
- stderr=stderr,
- output_json=None, # Legacy field, we now use records for JSONL
- output_files=new_files,
- duration_ms=duration_ms,
- hook=str(script),
- plugin=plugin_name,
- hook_name=hook_name,
- records=records,
+ # Create Process record
+ process = Process.objects.create(
+ machine=machine,
+ parent=parent,
+ process_type=Process.TypeChoices.HOOK,
+ pwd=str(output_dir),
+ cmd=cmd,
+ timeout=timeout,
)
+ # Build environment from config (Process._build_env() expects self.env dict)
+ # We need to set env on the process before launching
+ process.env = {}
+ for key, value in config.items():
+ if value is None:
+ continue
+ elif isinstance(value, bool):
+ process.env[key] = 'true' if value else 'false'
+ elif isinstance(value, (list, dict)):
+ process.env[key] = json.dumps(value)
+ else:
+ process.env[key] = str(value)
+
+ # Add base paths to env
+ process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
+ process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
+ process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
+
+ # Add LIB_DIR and LIB_BIN_DIR
+ lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
+ lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
+ if lib_dir:
+ process.env['LIB_DIR'] = str(lib_dir)
+ if not lib_bin_dir and lib_dir:
+ lib_bin_dir = Path(lib_dir) / 'bin'
+ if lib_bin_dir:
+ process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
+
+ # Set PATH from Machine.config if available
+ try:
+ if machine and machine.config:
+ machine_path = machine.config.get('config/PATH')
+ if machine_path:
+ # Prepend LIB_BIN_DIR to machine PATH as well
+ if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
+ process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
+ else:
+ process.env['PATH'] = machine_path
+ elif lib_bin_dir:
+ # Just prepend to current PATH
+ current_path = os.environ.get('PATH', '')
+ if not current_path.startswith(f'{lib_bin_dir}:'):
+ process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
+
+ # Also set NODE_MODULES_DIR if configured
+ node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
+ if node_modules_dir:
+ process.env['NODE_MODULES_DIR'] = node_modules_dir
+ except Exception:
+ pass # Fall back to system PATH if Machine not available
+
+ # Save env before launching
+ process.save()
+
+ # Launch subprocess using Process.launch()
+ process.launch(background=is_background)
+
+ # Return Process object (caller can use process.exit_code, process.stdout, process.get_records())
+ return process
+
except Exception as e:
- duration_ms = int((time.time() - start_time) * 1000)
- return HookResult(
- returncode=-1,
- stdout='',
+ # Create a failed Process record for exceptions
+ process = Process.objects.create(
+ machine=machine,
+ process_type=Process.TypeChoices.HOOK,
+ pwd=str(output_dir),
+ cmd=cmd,
+ timeout=timeout,
+ status=Process.StatusChoices.EXITED,
+ exit_code=-1,
stderr=f'Failed to run hook: {type(e).__name__}: {e}',
- output_json=None,
- output_files=[],
- duration_ms=duration_ms,
- hook=str(script),
- plugin=script.parent.name,
- hook_name=script.name,
- records=[],
)
+ return process
+
+
+def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
+ """
+ Extract JSONL records from a Process's stdout.
+
+ Uses the same parse_line() logic from misc/jsonl.py.
+ Adds plugin metadata to each record.
+
+ Args:
+ process: Process model instance with stdout captured
+
+ Returns:
+ List of parsed JSONL records with plugin metadata
+ """
+ from archivebox.misc.jsonl import parse_line
+
+ records = []
+
+ # Read stdout from process
+ stdout = process.stdout
+ if not stdout and process.stdout_file and process.stdout_file.exists():
+ stdout = process.stdout_file.read_text()
+
+ if not stdout:
+ return records
+
+ # Extract plugin metadata from process.pwd and process.cmd
+ plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
+ hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown'
+ plugin_hook = process.cmd[1] if len(process.cmd) > 1 else ''
+
+ # Parse each line as JSONL
+ for line in stdout.splitlines():
+ record = parse_line(line)
+ if record and 'type' in record:
+ # Add plugin metadata to record
+ record.setdefault('plugin', plugin_name)
+ record.setdefault('hook_name', hook_name)
+ record.setdefault('plugin_hook', plugin_hook)
+ records.append(record)
+
+ return records
def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
@@ -940,7 +967,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
import sys
- print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
+ print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
diff --git a/archivebox/machine/migrations/0009_alter_binary_status.py b/archivebox/machine/migrations/0009_alter_binary_status.py
new file mode 100644
index 00000000..88ed39ad
--- /dev/null
+++ b/archivebox/machine/migrations/0009_alter_binary_status.py
@@ -0,0 +1,18 @@
+# Generated by Django 6.0 on 2026-01-02 08:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('machine', '0008_add_worker_type_field'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='binary',
+ name='status',
+ field=models.CharField(choices=[('queued', 'Queued'), ('installed', 'Installed')], db_index=True, default='queued', max_length=16),
+ ),
+ ]
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index 417e4c9f..73680905 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -683,6 +683,7 @@ class Process(models.Model):
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
WORKER = 'worker', 'Worker'
CLI = 'cli', 'CLI'
+ HOOK = 'hook', 'Hook'
BINARY = 'binary', 'Binary'
# Primary fields
@@ -1415,6 +1416,10 @@ class Process(models.Model):
"""
Check if process has exited and update status if so.
+ Cleanup when process exits:
+ - Copy stdout/stderr to DB (keep files for debugging)
+ - Delete PID file
+
Returns:
exit_code if exited, None if still running
"""
@@ -1422,11 +1427,25 @@ class Process(models.Model):
return self.exit_code
if not self.is_running:
- # Process exited - read output and update status
+ # Process exited - read output and copy to DB
if self.stdout_file and self.stdout_file.exists():
self.stdout = self.stdout_file.read_text()
+ # TODO: Uncomment to cleanup (keeping for debugging for now)
+ # self.stdout_file.unlink(missing_ok=True)
if self.stderr_file and self.stderr_file.exists():
self.stderr = self.stderr_file.read_text()
+ # TODO: Uncomment to cleanup (keeping for debugging for now)
+ # self.stderr_file.unlink(missing_ok=True)
+
+ # Clean up PID file (not needed for debugging)
+ if self.pid_file and self.pid_file.exists():
+ self.pid_file.unlink(missing_ok=True)
+
+ # TODO: Uncomment to cleanup cmd.sh (keeping for debugging for now)
+ # if self.pwd:
+ # cmd_file = Path(self.pwd) / 'cmd.sh'
+ # if cmd_file.exists():
+ # cmd_file.unlink(missing_ok=True)
# Try to get exit code from proc or default to unknown
self.exit_code = self.exit_code if self.exit_code is not None else -1
@@ -1686,6 +1705,46 @@ class Process(models.Model):
"""
return cls.get_running_count(process_type=process_type, machine=machine)
+ @classmethod
+ def cleanup_orphaned_chrome(cls) -> int:
+ """
+ Kill orphaned Chrome processes using chrome_utils.js killZombieChrome.
+
+ Scans DATA_DIR for chrome/*.pid files from stale crawls (>5 min old)
+ and kills any orphaned Chrome processes.
+
+ Called by:
+ - Orchestrator on startup (cleanup from previous crashes)
+ - Orchestrator periodically (every N minutes)
+
+ Returns:
+ Number of zombie Chrome processes killed
+ """
+ import subprocess
+ from pathlib import Path
+ from django.conf import settings
+
+ chrome_utils = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'chrome_utils.js'
+ if not chrome_utils.exists():
+ return 0
+
+ try:
+ result = subprocess.run(
+ ['node', str(chrome_utils), 'killZombieChrome', str(settings.DATA_DIR)],
+ capture_output=True,
+ timeout=30,
+ text=True,
+ )
+ if result.returncode == 0:
+ killed = int(result.stdout.strip())
+ if killed > 0:
+ print(f'[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]')
+ return killed
+ except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e:
+ print(f'[red]Failed to cleanup orphaned Chrome: {e}[/red]')
+
+ return 0
+
# =============================================================================
# Binary State Machine
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 547b3b68..a3ad4566 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -530,13 +530,13 @@ def log_worker_event(
Log a worker event with structured metadata and indentation.
Args:
- worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
+ worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker)
event: Event name (Starting, Completed, Failed, etc.)
- indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
+ indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker)
pid: Process ID
- worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
- url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
- plugin: Plugin name (for ArchiveResultWorker)
+ worker_id: Worker ID (UUID for workers)
+ url: URL being processed (for SnapshotWorker)
+ plugin: Plugin name (for hook processes)
metadata: Dict of metadata to show in curly braces
error: Exception if event is an error
"""
diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py
new file mode 100644
index 00000000..f386cd13
--- /dev/null
+++ b/archivebox/misc/progress_layout.py
@@ -0,0 +1,345 @@
+"""
+Rich Layout-based live progress display for ArchiveBox orchestrator.
+
+Shows a comprehensive dashboard with:
+- Top: Crawl queue status (full width)
+- Middle: 4-column grid of SnapshotWorker progress panels
+- Bottom: Orchestrator/Daphne logs
+"""
+
+__package__ = 'archivebox.misc'
+
+from datetime import datetime, timezone
+from typing import Dict, List, Optional, Any
+from collections import deque
+
+from rich import box
+from rich.align import Align
+from rich.console import Console, Group, RenderableType
+from rich.layout import Layout
+from rich.panel import Panel
+from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
+from rich.table import Table
+from rich.text import Text
+
+from archivebox.config import VERSION
+
+# Maximum number of SnapshotWorker columns to display
+MAX_WORKER_COLUMNS = 4
+
+
+class CrawlQueuePanel:
+ """Display crawl queue status across full width."""
+
+ def __init__(self):
+ self.orchestrator_status = "Idle"
+ self.crawl_queue_count = 0
+ self.crawl_workers_count = 0
+ self.max_crawl_workers = 8
+ self.crawl_id: Optional[str] = None
+
+ def __rich__(self) -> Panel:
+ grid = Table.grid(expand=True)
+ grid.add_column(justify="left", ratio=1)
+ grid.add_column(justify="center", ratio=1)
+ grid.add_column(justify="center", ratio=1)
+ grid.add_column(justify="right", ratio=1)
+
+ # Left: ArchiveBox version + timestamp
+ left_text = Text()
+ left_text.append("ArchiveBox ", style="bold cyan")
+ left_text.append(f"v{VERSION}", style="bold yellow")
+ left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
+
+ # Center-left: Crawl queue status
+ queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
+ center_left_text = Text()
+ center_left_text.append("Crawls: ", style="white")
+ center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
+ center_left_text.append(" queued", style="grey53")
+
+ # Center-right: CrawlWorker status
+ worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
+ center_right_text = Text()
+ center_right_text.append("Workers: ", style="white")
+ center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
+ center_right_text.append(" active", style="grey53")
+
+ # Right: Orchestrator status
+ status_color = "green" if self.crawl_workers_count > 0 else "grey53"
+ right_text = Text()
+ right_text.append("Status: ", style="white")
+ right_text.append(self.orchestrator_status, style=f"bold {status_color}")
+ if self.crawl_id:
+ right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
+
+ grid.add_row(left_text, center_left_text, center_right_text, right_text)
+ return Panel(grid, style="white on blue", box=box.ROUNDED)
+
+
+class SnapshotWorkerPanel:
+ """Display progress for a single SnapshotWorker."""
+
+ def __init__(self, worker_num: int):
+ self.worker_num = worker_num
+ self.snapshot_id: Optional[str] = None
+ self.snapshot_url: Optional[str] = None
+ self.total_hooks: int = 0
+ self.completed_hooks: int = 0
+ self.current_plugin: Optional[str] = None
+ self.status: str = "idle" # idle, working, completed
+ self.recent_logs: deque = deque(maxlen=5)
+
+ def __rich__(self) -> Panel:
+ if self.status == "idle":
+ content = Align.center(
+ Text("Idle", style="grey53"),
+ vertical="middle",
+ )
+ border_style = "grey53"
+ title_style = "grey53"
+ else:
+ # Build progress display
+ lines = []
+
+ # URL (truncated)
+ if self.snapshot_url:
+ url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
+ lines.append(Text(url_display, style="cyan"))
+ lines.append(Text()) # Spacing
+
+ # Progress bar
+ if self.total_hooks > 0:
+ pct = (self.completed_hooks / self.total_hooks) * 100
+ bar_width = 30
+ filled = int((pct / 100) * bar_width)
+ bar = "█" * filled + "░" * (bar_width - filled)
+
+ # Color based on progress
+ if pct < 30:
+ bar_style = "yellow"
+ elif pct < 100:
+ bar_style = "green"
+ else:
+ bar_style = "blue"
+
+ progress_text = Text()
+ progress_text.append(bar, style=bar_style)
+ progress_text.append(f" {pct:.0f}%", style="white")
+ lines.append(progress_text)
+ lines.append(Text()) # Spacing
+
+ # Stats
+ stats = Table.grid(padding=(0, 1))
+ stats.add_column(style="grey53", no_wrap=True)
+ stats.add_column(style="white")
+ stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
+ if self.current_plugin:
+ stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
+ lines.append(stats)
+ lines.append(Text()) # Spacing
+
+ # Recent logs
+ if self.recent_logs:
+ lines.append(Text("Recent:", style="grey53"))
+ for log_msg, log_style in self.recent_logs:
+ log_text = Text(f"• {log_msg[:30]}", style=log_style)
+ lines.append(log_text)
+
+ content = Group(*lines)
+ border_style = "green" if self.status == "working" else "blue"
+ title_style = "green" if self.status == "working" else "blue"
+
+ return Panel(
+ content,
+ title=f"[{title_style}]Worker {self.worker_num}",
+ border_style=border_style,
+ box=box.ROUNDED,
+ height=20,
+ )
+
+ def add_log(self, message: str, style: str = "white"):
+ """Add a log message to this worker's recent logs."""
+ self.recent_logs.append((message, style))
+
+
+class OrchestratorLogPanel:
+ """Display orchestrator and system logs."""
+
+ def __init__(self, max_events: int = 15):
+ self.events: deque = deque(maxlen=max_events)
+ self.max_events = max_events
+
+ def add_event(self, message: str, style: str = "white"):
+ """Add an event to the log."""
+ timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
+ self.events.append((timestamp, message, style))
+
+ def __rich__(self) -> Panel:
+ if not self.events:
+ content = Text("No recent events", style="grey53", justify="center")
+ else:
+ lines = []
+ for timestamp, message, style in self.events:
+ line = Text()
+ line.append(f"[{timestamp}] ", style="grey53")
+ line.append(message, style=style)
+ lines.append(line)
+ content = Group(*lines)
+
+ return Panel(
+ content,
+ title="[bold white]Orchestrator / Daphne Logs",
+ border_style="white",
+ box=box.ROUNDED,
+ height=12,
+ )
+
+
+class ArchiveBoxProgressLayout:
+ """
+ Main layout manager for ArchiveBox orchestrator progress display.
+
+ Layout structure:
+ ┌─────────────────────────────────────────────────────────────┐
+ │ Crawl Queue (full width) │
+ ├───────────────┬───────────────┬───────────────┬─────────────┤
+ │ Snapshot │ Snapshot │ Snapshot │ Snapshot │
+ │ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
+ │ │ │ │ │
+ │ Progress + │ Progress + │ Progress + │ Progress + │
+ │ Stats + │ Stats + │ Stats + │ Stats + │
+ │ Logs │ Logs │ Logs │ Logs │
+ ├───────────────┴───────────────┴───────────────┴─────────────┤
+ │ Orchestrator / Daphne Logs │
+ └─────────────────────────────────────────────────────────────┘
+ """
+
+ def __init__(self, crawl_id: Optional[str] = None):
+ self.crawl_id = crawl_id
+ self.start_time = datetime.now(timezone.utc)
+
+ # Create components
+ self.crawl_queue = CrawlQueuePanel()
+ self.crawl_queue.crawl_id = crawl_id
+
+ # Create 4 worker panels
+ self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
+
+ self.orchestrator_log = OrchestratorLogPanel(max_events=12)
+
+ # Create layout
+ self.layout = self._make_layout()
+
+ # Track snapshot ID to worker panel mapping
+ self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
+
+ def _make_layout(self) -> Layout:
+ """Define the layout structure."""
+ layout = Layout(name="root")
+
+ # Top-level split: crawl_queue, workers, logs
+ layout.split(
+ Layout(name="crawl_queue", size=3),
+ Layout(name="workers", ratio=1),
+ Layout(name="logs", size=13),
+ )
+
+ # Split workers into 4 columns
+ layout["workers"].split_row(
+ Layout(name="worker1"),
+ Layout(name="worker2"),
+ Layout(name="worker3"),
+ Layout(name="worker4"),
+ )
+
+ # Assign components to layout sections
+ layout["crawl_queue"].update(self.crawl_queue)
+ layout["worker1"].update(self.worker_panels[0])
+ layout["worker2"].update(self.worker_panels[1])
+ layout["worker3"].update(self.worker_panels[2])
+ layout["worker4"].update(self.worker_panels[3])
+ layout["logs"].update(self.orchestrator_log)
+
+ return layout
+
+ def update_orchestrator_status(
+ self,
+ status: str,
+ crawl_queue_count: int = 0,
+ crawl_workers_count: int = 0,
+ max_crawl_workers: int = 8,
+ ):
+ """Update orchestrator status in the crawl queue panel."""
+ self.crawl_queue.orchestrator_status = status
+ self.crawl_queue.crawl_queue_count = crawl_queue_count
+ self.crawl_queue.crawl_workers_count = crawl_workers_count
+ self.crawl_queue.max_crawl_workers = max_crawl_workers
+
+ def update_snapshot_worker(
+ self,
+ snapshot_id: str,
+ url: str,
+ total: int,
+ completed: int,
+ current_plugin: str = "",
+ ):
+ """Update or assign a snapshot to a worker panel."""
+ # Find or assign worker panel for this snapshot
+ if snapshot_id not in self.snapshot_to_worker:
+ # Find first idle worker panel
+ worker_idx = None
+ for idx, panel in enumerate(self.worker_panels):
+ if panel.status == "idle":
+ worker_idx = idx
+ break
+
+ # If no idle worker, use round-robin (shouldn't happen often)
+ if worker_idx is None:
+ worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
+
+ self.snapshot_to_worker[snapshot_id] = worker_idx
+
+ # Get assigned worker panel
+ worker_idx = self.snapshot_to_worker[snapshot_id]
+ panel = self.worker_panels[worker_idx]
+
+ # Update panel
+ panel.snapshot_id = snapshot_id
+ panel.snapshot_url = url
+ panel.total_hooks = total
+ panel.completed_hooks = completed
+ panel.current_plugin = current_plugin
+ panel.status = "working" if completed < total else "completed"
+
+ def remove_snapshot_worker(self, snapshot_id: str):
+ """Mark a snapshot worker as idle after completion."""
+ if snapshot_id in self.snapshot_to_worker:
+ worker_idx = self.snapshot_to_worker[snapshot_id]
+ panel = self.worker_panels[worker_idx]
+
+ # Mark as idle
+ panel.status = "idle"
+ panel.snapshot_id = None
+ panel.snapshot_url = None
+ panel.total_hooks = 0
+ panel.completed_hooks = 0
+ panel.current_plugin = None
+ panel.recent_logs.clear()
+
+ # Remove mapping
+ del self.snapshot_to_worker[snapshot_id]
+
+ def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
+ """Add a log message to a specific worker's panel."""
+ if snapshot_id in self.snapshot_to_worker:
+ worker_idx = self.snapshot_to_worker[snapshot_id]
+ self.worker_panels[worker_idx].add_log(message, style)
+
+ def log_event(self, message: str, style: str = "white"):
+ """Add an event to the orchestrator log."""
+ self.orchestrator_log.add_event(message, style)
+
+ def get_layout(self) -> Layout:
+ """Get the Rich Layout object for rendering."""
+ return self.layout
diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py
index 0c85b145..addd51df 100644
--- a/archivebox/plugins/accessibility/tests/test_accessibility.py
+++ b/archivebox/plugins/accessibility/tests/test_accessibility.py
@@ -72,10 +72,8 @@ class TestAccessibilityWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the accessibility hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run accessibility hook with the active Chrome session
result = subprocess.run(
@@ -116,6 +114,85 @@ class TestAccessibilityWithChrome(TestCase):
self.skipTest(f"Chrome session setup failed: {e}")
raise
+ def test_accessibility_disabled_skips(self):
+ """Test that ACCESSIBILITY_ENABLED=False skips without error."""
+ test_url = 'https://example.com'
+ snapshot_id = 'test-disabled'
+
+ env = get_test_env()
+ env['ACCESSIBILITY_ENABLED'] = 'False'
+
+ result = subprocess.run(
+ ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(self.temp_dir),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=env
+ )
+
+ # Should exit 0 even when disabled
+ self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}")
+
+ # Should NOT create output file when disabled
+ accessibility_output = self.temp_dir / 'accessibility.json'
+ self.assertFalse(accessibility_output.exists(), "Should not create file when disabled")
+
+ def test_accessibility_missing_url_argument(self):
+ """Test that missing --url argument causes error."""
+ snapshot_id = 'test-missing-url'
+
+ result = subprocess.run(
+ ['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'],
+ cwd=str(self.temp_dir),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=get_test_env()
+ )
+
+ # Should fail with non-zero exit code
+ self.assertNotEqual(result.returncode, 0, "Should fail when URL missing")
+
+ def test_accessibility_missing_snapshot_id_argument(self):
+ """Test that missing --snapshot-id argument causes error."""
+ test_url = 'https://example.com'
+
+ result = subprocess.run(
+ ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'],
+ cwd=str(self.temp_dir),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=get_test_env()
+ )
+
+ # Should fail with non-zero exit code
+ self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing")
+
+ def test_accessibility_with_no_chrome_session(self):
+ """Test that hook fails gracefully when no Chrome session exists."""
+ test_url = 'https://example.com'
+ snapshot_id = 'test-no-chrome'
+
+ result = subprocess.run(
+ ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(self.temp_dir),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=get_test_env()
+ )
+
+ # Should fail when no Chrome session
+ self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists")
+ # Error should mention CDP or Chrome
+ err_lower = result.stderr.lower()
+ self.assertTrue(
+ any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']),
+ f"Should mention Chrome/CDP in error: {result.stderr}"
+ )
+
if __name__ == '__main__':
pytest.main([__file__, '-v'])
diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index dd9ad47b..f61cfcdd 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -1397,11 +1397,11 @@ function getMachineType() {
*/
function getLibDir() {
if (process.env.LIB_DIR) {
- return process.env.LIB_DIR;
+ return path.resolve(process.env.LIB_DIR);
}
const dataDir = getEnv('DATA_DIR', './data');
const machineType = getMachineType();
- return path.join(dataDir, 'lib', machineType);
+ return path.resolve(path.join(dataDir, 'lib', machineType));
}
/**
@@ -1412,9 +1412,9 @@ function getLibDir() {
*/
function getNodeModulesDir() {
if (process.env.NODE_MODULES_DIR) {
- return process.env.NODE_MODULES_DIR;
+ return path.resolve(process.env.NODE_MODULES_DIR);
}
- return path.join(getLibDir(), 'npm', 'node_modules');
+ return path.resolve(path.join(getLibDir(), 'npm', 'node_modules'));
}
/**
diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
index 89301f5f..3c2424ca 100644
--- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py
+++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
@@ -37,9 +37,8 @@ Usage:
# For Chrome session tests:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
- setup_chrome_session, # Full Chrome + tab setup
- cleanup_chrome, # Cleanup by PID
- chrome_session, # Context manager
+ chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup)
+ cleanup_chrome, # Manual cleanup by PID (rarely needed)
)
# For extension tests:
@@ -184,8 +183,7 @@ def get_lib_dir() -> Path:
# Fallback to Python
if os.environ.get('LIB_DIR'):
return Path(os.environ['LIB_DIR'])
- from archivebox.config.common import STORAGE_CONFIG
- return Path(str(STORAGE_CONFIG.LIB_DIR))
+ raise Exception('LIB_DIR env var must be set!')
def get_node_modules_dir() -> Path:
@@ -695,111 +693,6 @@ def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
# =============================================================================
-def setup_chrome_session(
- tmpdir: Path,
- crawl_id: str = 'test-crawl',
- snapshot_id: str = 'test-snapshot',
- test_url: str = 'about:blank',
- navigate: bool = True,
- timeout: int = 15,
-) -> Tuple[subprocess.Popen, int, Path]:
- """Set up a Chrome session with tab and optional navigation.
-
- Creates the directory structure, launches Chrome, creates a tab,
- and optionally navigates to the test URL.
-
- Args:
- tmpdir: Temporary directory for test files
- crawl_id: ID to use for the crawl
- snapshot_id: ID to use for the snapshot
- test_url: URL to navigate to (if navigate=True)
- navigate: Whether to navigate to the URL after creating tab
- timeout: Seconds to wait for Chrome to start
-
- Returns:
- Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
-
- Raises:
- RuntimeError: If Chrome fails to start or tab creation fails
- """
- crawl_dir = Path(tmpdir) / 'crawl'
- crawl_dir.mkdir(exist_ok=True)
- chrome_dir = crawl_dir / 'chrome'
- chrome_dir.mkdir(exist_ok=True)
-
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
-
- # Launch Chrome at crawl level
- chrome_launch_process = subprocess.Popen(
- ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
- cwd=str(chrome_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
-
- # Wait for Chrome to launch
- for i in range(timeout):
- if chrome_launch_process.poll() is not None:
- stdout, stderr = chrome_launch_process.communicate()
- raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
- if (chrome_dir / 'cdp_url.txt').exists():
- break
- time.sleep(1)
-
- if not (chrome_dir / 'cdp_url.txt').exists():
- raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
-
- chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
-
- # Create snapshot directory structure
- snapshot_dir = Path(tmpdir) / 'snapshot'
- snapshot_dir.mkdir(exist_ok=True)
- snapshot_chrome_dir = snapshot_dir / 'chrome'
- snapshot_chrome_dir.mkdir(exist_ok=True)
-
- # Create tab
- tab_env = env.copy()
- tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
- try:
- result = subprocess.run(
- ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
- cwd=str(snapshot_chrome_dir),
- capture_output=True,
- text=True,
- timeout=60,
- env=tab_env
- )
- if result.returncode != 0:
- cleanup_chrome(chrome_launch_process, chrome_pid)
- raise RuntimeError(f"Tab creation failed: {result.stderr}")
- except subprocess.TimeoutExpired:
- cleanup_chrome(chrome_launch_process, chrome_pid)
- raise RuntimeError("Tab creation timed out after 60s")
-
- # Navigate to URL if requested
- if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
- try:
- result = subprocess.run(
- ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- if result.returncode != 0:
- cleanup_chrome(chrome_launch_process, chrome_pid)
- raise RuntimeError(f"Navigation failed: {result.stderr}")
- except subprocess.TimeoutExpired:
- cleanup_chrome(chrome_launch_process, chrome_pid)
- raise RuntimeError("Navigation timed out after 120s")
-
- return chrome_launch_process, chrome_pid, snapshot_chrome_dir
-
-
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
"""Clean up Chrome processes using chrome_utils.js killChrome.
@@ -835,8 +728,12 @@ def chrome_session(
):
"""Context manager for Chrome sessions with automatic cleanup.
+ Creates the directory structure, launches Chrome, creates a tab,
+ and optionally navigates to the test URL. Automatically cleans up
+ Chrome on exit.
+
Usage:
- with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
+ with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env):
# Run tests with chrome session
pass
# Chrome automatically cleaned up
@@ -850,20 +747,129 @@ def chrome_session(
timeout: Seconds to wait for Chrome to start
Yields:
- Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
+ Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env)
+
+ Raises:
+ RuntimeError: If Chrome fails to start or tab creation fails
"""
chrome_launch_process = None
chrome_pid = None
try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- tmpdir=tmpdir,
- crawl_id=crawl_id,
- snapshot_id=snapshot_id,
- test_url=test_url,
- navigate=navigate,
- timeout=timeout,
+ # Create proper directory structure in tmpdir
+ machine = platform.machine().lower()
+ system = platform.system().lower()
+ if machine in ('arm64', 'aarch64'):
+ machine = 'arm64'
+ elif machine in ('x86_64', 'amd64'):
+ machine = 'x86_64'
+ machine_type = f"{machine}-{system}"
+
+ data_dir = Path(tmpdir) / 'data'
+ lib_dir = data_dir / 'lib' / machine_type
+ npm_dir = lib_dir / 'npm'
+ node_modules_dir = npm_dir / 'node_modules'
+
+ # Create lib structure for puppeteer installation
+ node_modules_dir.mkdir(parents=True, exist_ok=True)
+
+ # Create crawl and snapshot directories
+ crawl_dir = Path(tmpdir) / 'crawl'
+ crawl_dir.mkdir(exist_ok=True)
+ chrome_dir = crawl_dir / 'chrome'
+ chrome_dir.mkdir(exist_ok=True)
+
+ # Build env with tmpdir-specific paths
+ env = os.environ.copy()
+ env.update({
+ 'DATA_DIR': str(data_dir),
+ 'LIB_DIR': str(lib_dir),
+ 'MACHINE_TYPE': machine_type,
+ 'NODE_MODULES_DIR': str(node_modules_dir),
+ 'NODE_PATH': str(node_modules_dir),
+ 'NPM_BIN_DIR': str(npm_dir / '.bin'),
+ 'CHROME_HEADLESS': 'true',
+ })
+
+ # CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
+ # chrome_launch assumes chrome_install has already run
+ install_result = subprocess.run(
+ ['python', str(CHROME_INSTALL_HOOK)],
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
)
- yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
+ if install_result.returncode != 0:
+ raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
+
+ # Launch Chrome at crawl level
+ chrome_launch_process = subprocess.Popen(
+ ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
+ cwd=str(chrome_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
+
+ # Wait for Chrome to launch
+ for i in range(timeout):
+ if chrome_launch_process.poll() is not None:
+ stdout, stderr = chrome_launch_process.communicate()
+ raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
+ if (chrome_dir / 'cdp_url.txt').exists():
+ break
+ time.sleep(1)
+
+ if not (chrome_dir / 'cdp_url.txt').exists():
+ raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
+
+ chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
+
+ # Create snapshot directory structure
+ snapshot_dir = Path(tmpdir) / 'snapshot'
+ snapshot_dir.mkdir(exist_ok=True)
+ snapshot_chrome_dir = snapshot_dir / 'chrome'
+ snapshot_chrome_dir.mkdir(exist_ok=True)
+
+ # Create tab
+ tab_env = env.copy()
+ tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
+ try:
+ result = subprocess.run(
+ ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=60,
+ env=tab_env
+ )
+ if result.returncode != 0:
+ cleanup_chrome(chrome_launch_process, chrome_pid)
+ raise RuntimeError(f"Tab creation failed: {result.stderr}")
+ except subprocess.TimeoutExpired:
+ cleanup_chrome(chrome_launch_process, chrome_pid)
+ raise RuntimeError("Tab creation timed out after 60s")
+
+ # Navigate to URL if requested
+ if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
+ try:
+ result = subprocess.run(
+ ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(snapshot_chrome_dir),
+ capture_output=True,
+ text=True,
+ timeout=120,
+ env=env
+ )
+ if result.returncode != 0:
+ cleanup_chrome(chrome_launch_process, chrome_pid)
+ raise RuntimeError(f"Navigation failed: {result.stderr}")
+ except subprocess.TimeoutExpired:
+ cleanup_chrome(chrome_launch_process, chrome_pid)
+ raise RuntimeError("Navigation timed out after 120s")
+
+ yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env
finally:
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py
index 0702f95f..82672566 100644
--- a/archivebox/plugins/chrome/tests/test_chrome.py
+++ b/archivebox/plugins/chrome/tests/test_chrome.py
@@ -525,10 +525,9 @@ def test_zombie_prevention_hook_killed():
time.sleep(1)
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
- assert (chrome_dir / 'hook.pid').exists(), "Hook PID file should exist"
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
- hook_pid = int((chrome_dir / 'hook.pid').read_text().strip())
+ hook_pid = chrome_launch_process.pid # Use the Popen process PID instead of hook.pid file
# Verify both Chrome and hook are running
try:
diff --git a/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py b/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py
new file mode 100644
index 00000000..703ea037
--- /dev/null
+++ b/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py
@@ -0,0 +1,260 @@
+"""
+Tests for chrome_test_helpers.py functions.
+
+These tests verify the Python helper functions used across Chrome plugin tests.
+"""
+
+import os
+import pytest
+import tempfile
+from pathlib import Path
+
+from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+ get_test_env,
+ get_machine_type,
+ get_lib_dir,
+ get_node_modules_dir,
+ get_extensions_dir,
+ find_chromium_binary,
+ get_plugin_dir,
+ get_hook_script,
+ parse_jsonl_output,
+)
+
+
+def test_get_machine_type():
+ """Test get_machine_type() returns valid format."""
+ machine_type = get_machine_type()
+ assert isinstance(machine_type, str)
+ assert '-' in machine_type, "Machine type should be in format: arch-os"
+ # Should be one of the expected formats
+ assert any(x in machine_type for x in ['arm64', 'x86_64']), "Should contain valid architecture"
+ assert any(x in machine_type for x in ['darwin', 'linux', 'win32']), "Should contain valid OS"
+
+
+def test_get_lib_dir_with_env_var():
+ """Test get_lib_dir() respects LIB_DIR env var."""
+ with tempfile.TemporaryDirectory() as tmpdir:
+ custom_lib = Path(tmpdir) / 'custom_lib'
+ custom_lib.mkdir()
+
+ old_lib_dir = os.environ.get('LIB_DIR')
+ try:
+ os.environ['LIB_DIR'] = str(custom_lib)
+ lib_dir = get_lib_dir()
+ assert lib_dir == custom_lib
+ finally:
+ if old_lib_dir:
+ os.environ['LIB_DIR'] = old_lib_dir
+ else:
+ os.environ.pop('LIB_DIR', None)
+
+
+def test_get_node_modules_dir_with_env_var():
+ """Test get_node_modules_dir() respects NODE_MODULES_DIR env var."""
+ with tempfile.TemporaryDirectory() as tmpdir:
+ custom_nm = Path(tmpdir) / 'node_modules'
+ custom_nm.mkdir()
+
+ old_nm_dir = os.environ.get('NODE_MODULES_DIR')
+ try:
+ os.environ['NODE_MODULES_DIR'] = str(custom_nm)
+ nm_dir = get_node_modules_dir()
+ assert nm_dir == custom_nm
+ finally:
+ if old_nm_dir:
+ os.environ['NODE_MODULES_DIR'] = old_nm_dir
+ else:
+ os.environ.pop('NODE_MODULES_DIR', None)
+
+
+def test_get_extensions_dir_default():
+ """Test get_extensions_dir() returns expected path format."""
+ ext_dir = get_extensions_dir()
+ assert isinstance(ext_dir, str)
+ assert 'personas' in ext_dir
+ assert 'chrome_extensions' in ext_dir
+
+
+def test_get_extensions_dir_with_custom_persona():
+ """Test get_extensions_dir() respects ACTIVE_PERSONA env var."""
+ old_persona = os.environ.get('ACTIVE_PERSONA')
+ old_data_dir = os.environ.get('DATA_DIR')
+ try:
+ os.environ['ACTIVE_PERSONA'] = 'TestPersona'
+ os.environ['DATA_DIR'] = '/tmp/test'
+ ext_dir = get_extensions_dir()
+ assert 'TestPersona' in ext_dir
+ assert '/tmp/test' in ext_dir
+ finally:
+ if old_persona:
+ os.environ['ACTIVE_PERSONA'] = old_persona
+ else:
+ os.environ.pop('ACTIVE_PERSONA', None)
+ if old_data_dir:
+ os.environ['DATA_DIR'] = old_data_dir
+ else:
+ os.environ.pop('DATA_DIR', None)
+
+
+def test_get_test_env_returns_dict():
+ """Test get_test_env() returns properly formatted environment dict."""
+ env = get_test_env()
+ assert isinstance(env, dict)
+
+ # Should include key paths
+ assert 'MACHINE_TYPE' in env
+ assert 'LIB_DIR' in env
+ assert 'NODE_MODULES_DIR' in env
+ assert 'NODE_PATH' in env # Critical for module resolution
+ assert 'NPM_BIN_DIR' in env
+ assert 'CHROME_EXTENSIONS_DIR' in env
+
+ # Verify NODE_PATH equals NODE_MODULES_DIR (for Node.js module resolution)
+ assert env['NODE_PATH'] == env['NODE_MODULES_DIR']
+
+
+def test_get_test_env_paths_are_absolute():
+ """Test that get_test_env() returns absolute paths."""
+ env = get_test_env()
+
+ # All path-like values should be absolute
+ assert Path(env['LIB_DIR']).is_absolute()
+ assert Path(env['NODE_MODULES_DIR']).is_absolute()
+ assert Path(env['NODE_PATH']).is_absolute()
+
+
+def test_find_chromium_binary():
+ """Test find_chromium_binary() returns a path or None."""
+ binary = find_chromium_binary()
+ if binary:
+ assert isinstance(binary, str)
+ # Should be an absolute path if found
+ assert os.path.isabs(binary)
+
+
+def test_get_plugin_dir():
+ """Test get_plugin_dir() finds correct plugin directory."""
+ # Use this test file's path
+ test_file = __file__
+ plugin_dir = get_plugin_dir(test_file)
+
+ assert plugin_dir.exists()
+ assert plugin_dir.is_dir()
+ # Should be the chrome plugin directory
+ assert plugin_dir.name == 'chrome'
+ assert (plugin_dir.parent.name == 'plugins')
+
+
+def test_get_hook_script_finds_existing_hook():
+ """Test get_hook_script() can find an existing hook."""
+ from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
+
+ # Try to find the chrome launch hook
+ hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
+
+ if hook: # May not exist in all test environments
+ assert hook.exists()
+ assert hook.is_file()
+ assert 'chrome_launch' in hook.name
+
+
+def test_get_hook_script_returns_none_for_missing():
+ """Test get_hook_script() returns None for non-existent hooks."""
+ from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
+
+ hook = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*')
+ assert hook is None
+
+
+def test_parse_jsonl_output_valid():
+ """Test parse_jsonl_output() parses valid JSONL."""
+ jsonl_output = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"}
+{"type": "ArchiveResult", "status": "failed", "error": "test2"}
+'''
+
+ # Returns first match only
+ result = parse_jsonl_output(jsonl_output)
+ assert result is not None
+ assert result['type'] == 'ArchiveResult'
+ assert result['status'] == 'succeeded'
+ assert result['output'] == 'test1'
+
+
+def test_parse_jsonl_output_with_non_json_lines():
+ """Test parse_jsonl_output() skips non-JSON lines."""
+ mixed_output = '''Some non-JSON output
+{"type": "ArchiveResult", "status": "succeeded"}
+More non-JSON
+{"type": "ArchiveResult", "status": "failed"}
+'''
+
+ result = parse_jsonl_output(mixed_output)
+ assert result is not None
+ assert result['type'] == 'ArchiveResult'
+ assert result['status'] == 'succeeded'
+
+
+def test_parse_jsonl_output_empty():
+ """Test parse_jsonl_output() handles empty input."""
+ result = parse_jsonl_output('')
+ assert result is None
+
+
+def test_parse_jsonl_output_filters_by_type():
+ """Test parse_jsonl_output() can filter by record type."""
+ jsonl_output = '''{"type": "LogEntry", "data": "log1"}
+{"type": "ArchiveResult", "data": "result1"}
+{"type": "ArchiveResult", "data": "result2"}
+'''
+
+ # Should return first ArchiveResult, not LogEntry
+ result = parse_jsonl_output(jsonl_output, record_type='ArchiveResult')
+ assert result is not None
+ assert result['type'] == 'ArchiveResult'
+ assert result['data'] == 'result1' # First ArchiveResult
+
+
+def test_parse_jsonl_output_filters_custom_type():
+ """Test parse_jsonl_output() can filter by custom record type."""
+ jsonl_output = '''{"type": "ArchiveResult", "data": "result1"}
+{"type": "LogEntry", "data": "log1"}
+{"type": "ArchiveResult", "data": "result2"}
+'''
+
+ result = parse_jsonl_output(jsonl_output, record_type='LogEntry')
+ assert result is not None
+ assert result['type'] == 'LogEntry'
+ assert result['data'] == 'log1'
+
+
+def test_machine_type_consistency():
+ """Test that machine type is consistent across calls."""
+ mt1 = get_machine_type()
+ mt2 = get_machine_type()
+ assert mt1 == mt2, "Machine type should be stable across calls"
+
+
+def test_lib_dir_is_directory():
+ """Test that lib_dir points to an actual directory when DATA_DIR is set."""
+ with tempfile.TemporaryDirectory() as tmpdir:
+ old_data_dir = os.environ.get('DATA_DIR')
+ try:
+ os.environ['DATA_DIR'] = tmpdir
+ # Create the expected directory structure
+ machine_type = get_machine_type()
+ lib_dir = Path(tmpdir) / 'lib' / machine_type
+ lib_dir.mkdir(parents=True, exist_ok=True)
+
+ result = get_lib_dir()
+ # Should return a Path object
+ assert isinstance(result, Path)
+ finally:
+ if old_data_dir:
+ os.environ['DATA_DIR'] = old_data_dir
+ else:
+ os.environ.pop('DATA_DIR', None)
+
+
+if __name__ == '__main__':
+ pytest.main([__file__, '-v'])
diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py
index 741776f0..2f9189ff 100644
--- a/archivebox/plugins/consolelog/tests/test_consolelog.py
+++ b/archivebox/plugins/consolelog/tests/test_consolelog.py
@@ -72,10 +72,9 @@ class TestConsolelogWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the consolelog hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run consolelog hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py
new file mode 100755
index 00000000..f52a72f2
--- /dev/null
+++ b/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Detect forum-dl binary and emit Binary JSONL record.
+
+Output: Binary JSONL record to stdout if forum-dl is found
+"""
+
+import json
+import os
+import sys
+
+from abx_pkg import Binary, EnvProvider
+
+
+def get_env(name: str, default: str = '') -> str:
+ return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+ val = get_env(name, '').lower()
+ if val in ('true', '1', 'yes', 'on'):
+ return True
+ if val in ('false', '0', 'no', 'off'):
+ return False
+ return default
+
+
+def output_binary_found(binary: Binary, name: str):
+ """Output Binary JSONL record for an installed binary."""
+ machine_id = os.environ.get('MACHINE_ID', '')
+
+ record = {
+ 'type': 'Binary',
+ 'name': name,
+ 'abspath': str(binary.abspath),
+ 'version': str(binary.version) if binary.version else '',
+ 'sha256': binary.sha256 or '',
+ 'binprovider': 'env', # Already installed
+ 'machine_id': machine_id,
+ }
+ print(json.dumps(record))
+
+
+def output_binary_missing(name: str, binproviders: str):
+ """Output Binary JSONL record for a missing binary that needs installation."""
+ machine_id = os.environ.get('MACHINE_ID', '')
+
+ record = {
+ 'type': 'Binary',
+ 'name': name,
+ 'binproviders': binproviders, # Providers that can install it
+ 'machine_id': machine_id,
+ }
+ print(json.dumps(record))
+
+
+def main():
+ forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
+ forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl')
+
+ if not forumdl_enabled:
+ sys.exit(0)
+
+ provider = EnvProvider()
+ try:
+ binary = Binary(name=forumdl_binary, binproviders=[provider]).load()
+ if binary.abspath:
+ # Binary found
+ output_binary_found(binary, name='forum-dl')
+ else:
+ # Binary not found
+ output_binary_missing(name='forum-dl', binproviders='pip')
+ except Exception:
+ # Binary not found
+ output_binary_missing(name='forum-dl', binproviders='pip')
+
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
index eee44ce4..1248518a 100644
--- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
+++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
@@ -24,8 +24,7 @@ import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
- setup_chrome_session,
- cleanup_chrome,
+ chrome_session,
)
@@ -101,22 +100,17 @@ def test_fails_gracefully_without_chrome_session():
def test_scrolls_page_and_outputs_stats():
"""Integration test: scroll page and verify JSONL output format."""
with tempfile.TemporaryDirectory() as tmpdir:
- chrome_launch_process = None
- chrome_pid = None
- try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- Path(tmpdir),
- crawl_id='test-infiniscroll',
- snapshot_id='snap-infiniscroll',
- test_url=TEST_URL,
- )
-
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-infiniscroll',
+ snapshot_id='snap-infiniscroll',
+ test_url=TEST_URL,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
# Create infiniscroll output directory (sibling to chrome)
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
# Run infiniscroll hook
- env = get_test_env()
env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test
env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling
env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test
@@ -158,29 +152,21 @@ def test_scrolls_page_and_outputs_stats():
output_files = list(infiniscroll_dir.iterdir())
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
- finally:
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
-
def test_config_scroll_limit_honored():
"""Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
with tempfile.TemporaryDirectory() as tmpdir:
- chrome_launch_process = None
- chrome_pid = None
- try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- Path(tmpdir),
- crawl_id='test-scroll-limit',
- snapshot_id='snap-limit',
- test_url=TEST_URL,
- )
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-scroll-limit',
+ snapshot_id='snap-limit',
+ test_url=TEST_URL,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
- # Set scroll limit to 2
- env = get_test_env()
+ # Set scroll limit to 2 (use env from setup_chrome_session)
env['INFINISCROLL_SCROLL_LIMIT'] = '2'
env['INFINISCROLL_SCROLL_DELAY'] = '500'
env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in
@@ -215,29 +201,22 @@ def test_config_scroll_limit_honored():
assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
- finally:
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_timeout_honored():
"""Test that INFINISCROLL_TIMEOUT config is respected."""
with tempfile.TemporaryDirectory() as tmpdir:
- chrome_launch_process = None
- chrome_pid = None
- try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- Path(tmpdir),
- crawl_id='test-timeout',
- snapshot_id='snap-timeout',
- test_url=TEST_URL,
- )
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-timeout',
+ snapshot_id='snap-timeout',
+ test_url=TEST_URL,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
- # Set very short timeout
- env = get_test_env()
+ # Set very short timeout (use env from setup_chrome_session)
env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds
env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger
env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit
@@ -258,9 +237,6 @@ def test_config_timeout_honored():
assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
- finally:
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
if __name__ == '__main__':
diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
index a9525c89..7fdc1c4a 100644
--- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
+++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
@@ -154,8 +154,7 @@ def test_extension_loads_in_chromium():
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
- cwd=str(tmpdir,
- env=get_test_env()),
+ cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
diff --git a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
index 57fe5e7e..25d1c9c1 100755
--- a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
+++ b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
-Detect mercury-parser binary and emit Binary JSONL record.
+Detect postlight-parser binary and emit Binary JSONL record.
-Output: Binary JSONL record to stdout if mercury-parser is found
+Output: Binary JSONL record to stdout if postlight-parser is found
"""
import json
@@ -48,6 +48,11 @@ def output_binary_missing(name: str, binproviders: str):
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
+ 'overrides': {
+ 'npm': {
+ 'packages': ['@postlight/parser'],
+ }
+ },
'machine_id': machine_id,
}
print(json.dumps(record))
@@ -55,7 +60,7 @@ def output_binary_missing(name: str, binproviders: str):
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
- mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
+ mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
if not mercury_enabled:
sys.exit(0)
@@ -65,13 +70,13 @@ def main():
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
- output_binary_found(binary, name='mercury-parser')
+ output_binary_found(binary, name='postlight-parser')
else:
# Binary not found
- output_binary_missing(name='mercury-parser', binproviders='npm')
+ output_binary_missing(name='postlight-parser', binproviders='npm')
except Exception:
# Binary not found
- output_binary_missing(name='mercury-parser', binproviders='npm')
+ output_binary_missing(name='postlight-parser', binproviders='npm')
sys.exit(0)
diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
index 1039d99c..b66d20d2 100644
--- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py
+++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
@@ -25,8 +25,7 @@ import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
- setup_chrome_session,
- cleanup_chrome,
+ chrome_session,
)
@@ -103,129 +102,119 @@ def test_fails_gracefully_without_chrome_session():
def test_background_script_handles_sigterm():
"""Test that background script runs and handles SIGTERM correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
- chrome_launch_process = None
- chrome_pid = None
modalcloser_process = None
try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+ with chrome_session(
Path(tmpdir),
crawl_id='test-modalcloser',
snapshot_id='snap-modalcloser',
test_url=TEST_URL,
- )
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
+ # Create modalcloser output directory (sibling to chrome)
+ modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
+ modalcloser_dir.mkdir()
- # Create modalcloser output directory (sibling to chrome)
- modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
- modalcloser_dir.mkdir()
+ # Run modalcloser as background process (use env from setup_chrome_session)
+ env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
- # Run modalcloser as background process
- env = get_test_env()
- env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
+ modalcloser_process = subprocess.Popen(
+ ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
+ cwd=str(modalcloser_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- modalcloser_process = subprocess.Popen(
- ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
- cwd=str(modalcloser_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ # Let it run for a bit
+ time.sleep(2)
- # Let it run for a bit
- time.sleep(2)
+ # Verify it's still running (background script)
+ assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
- # Verify it's still running (background script)
- assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
+ # Send SIGTERM
+ modalcloser_process.send_signal(signal.SIGTERM)
+ stdout, stderr = modalcloser_process.communicate(timeout=5)
- # Send SIGTERM
- modalcloser_process.send_signal(signal.SIGTERM)
- stdout, stderr = modalcloser_process.communicate(timeout=5)
+ assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
- assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
+ # Parse JSONL output
+ result_json = None
+ for line in stdout.strip().split('\n'):
+ line = line.strip()
+ if line.startswith('{'):
+ try:
+ record = json.loads(line)
+ if record.get('type') == 'ArchiveResult':
+ result_json = record
+ break
+ except json.JSONDecodeError:
+ pass
- # Parse JSONL output
- result_json = None
- for line in stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
+ assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
+ assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
+ # Verify output_str format
+ output_str = result_json.get('output_str', '')
+ assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
+ f"output_str should mention modals/dialogs: {output_str}"
- # Verify output_str format
- output_str = result_json.get('output_str', '')
- assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
- f"output_str should mention modals/dialogs: {output_str}"
-
- # Verify no files created in output directory
- output_files = list(modalcloser_dir.iterdir())
- assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
+ # Verify no files created in output directory
+ output_files = list(modalcloser_dir.iterdir())
+ assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
def test_dialog_handler_logs_dialogs():
"""Test that dialog handler is set up correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
- chrome_launch_process = None
- chrome_pid = None
modalcloser_process = None
try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- Path(tmpdir),
- crawl_id='test-dialog',
- snapshot_id='snap-dialog',
- test_url=TEST_URL,
- )
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-dialog',
+ snapshot_id='snap-dialog',
+ test_url=TEST_URL,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
- modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
- modalcloser_dir.mkdir()
+ modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
+ modalcloser_dir.mkdir()
- env = get_test_env()
- env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
- env['MODALCLOSER_POLL_INTERVAL'] = '200'
+ # Use env from setup_chrome_session
+ env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
+ env['MODALCLOSER_POLL_INTERVAL'] = '200'
- modalcloser_process = subprocess.Popen(
- ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
- cwd=str(modalcloser_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ modalcloser_process = subprocess.Popen(
+ ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
+ cwd=str(modalcloser_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- # Let it run briefly
- time.sleep(1.5)
+ # Let it run briefly
+ time.sleep(1.5)
- # Verify it's running
- assert modalcloser_process.poll() is None, "Should be running"
+ # Verify it's running
+ assert modalcloser_process.poll() is None, "Should be running"
- # Check stderr for "listening" message
- # Note: Can't read stderr while process is running without blocking,
- # so we just verify it exits cleanly
- modalcloser_process.send_signal(signal.SIGTERM)
- stdout, stderr = modalcloser_process.communicate(timeout=5)
+ # Check stderr for "listening" message
+ # Note: Can't read stderr while process is running without blocking,
+ # so we just verify it exits cleanly
+ modalcloser_process.send_signal(signal.SIGTERM)
+ stdout, stderr = modalcloser_process.communicate(timeout=5)
- assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
- f"Should log startup message: {stderr}"
- assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
+ assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
+ f"Should log startup message: {stderr}"
+ assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_poll_interval():
@@ -235,61 +224,58 @@ def test_config_poll_interval():
chrome_pid = None
modalcloser_process = None
try:
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- Path(tmpdir),
- crawl_id='test-poll',
- snapshot_id='snap-poll',
- test_url=TEST_URL,
- )
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-poll',
+ snapshot_id='snap-poll',
+ test_url=TEST_URL,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
- modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
- modalcloser_dir.mkdir()
+ modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
+ modalcloser_dir.mkdir()
- # Set very short poll interval
- env = get_test_env()
- env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
+ # Set very short poll interval (use env from setup_chrome_session)
+ env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
- modalcloser_process = subprocess.Popen(
- ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
- cwd=str(modalcloser_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
+ modalcloser_process = subprocess.Popen(
+ ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
+ cwd=str(modalcloser_dir),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True,
+ env=env
+ )
- # Run for short time
- time.sleep(1)
+ # Run for short time
+ time.sleep(1)
- # Should still be running
- assert modalcloser_process.poll() is None, "Should still be running"
+ # Should still be running
+ assert modalcloser_process.poll() is None, "Should still be running"
- # Clean exit
- modalcloser_process.send_signal(signal.SIGTERM)
- stdout, stderr = modalcloser_process.communicate(timeout=5)
+ # Clean exit
+ modalcloser_process.send_signal(signal.SIGTERM)
+ stdout, stderr = modalcloser_process.communicate(timeout=5)
- assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
+ assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
- # Verify JSONL output exists
- result_json = None
- for line in stdout.strip().split('\n'):
- if line.strip().startswith('{'):
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
+ # Verify JSONL output exists
+ result_json = None
+ for line in stdout.strip().split('\n'):
+ if line.strip().startswith('{'):
+ try:
+ record = json.loads(line)
+ if record.get('type') == 'ArchiveResult':
+ result_json = record
+ break
+ except json.JSONDecodeError:
+ pass
- assert result_json is not None, "Should have JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
+ assert result_json is not None, "Should have JSONL output"
+ assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
- if chrome_launch_process and chrome_pid:
- cleanup_chrome(chrome_launch_process, chrome_pid)
def test_hides_cookie_consent_on_filmin():
diff --git a/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py b/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py
new file mode 100755
index 00000000..8c548c7c
--- /dev/null
+++ b/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Detect papers-dl binary and emit Binary JSONL record.
+
+Output: Binary JSONL record to stdout if papers-dl is found
+"""
+
+import json
+import os
+import sys
+
+from abx_pkg import Binary, EnvProvider
+
+
+def get_env(name: str, default: str = '') -> str:
+ return os.environ.get(name, default).strip()
+
+def get_env_bool(name: str, default: bool = False) -> bool:
+ val = get_env(name, '').lower()
+ if val in ('true', '1', 'yes', 'on'):
+ return True
+ if val in ('false', '0', 'no', 'off'):
+ return False
+ return default
+
+
+def output_binary_found(binary: Binary, name: str):
+ """Output Binary JSONL record for an installed binary."""
+ machine_id = os.environ.get('MACHINE_ID', '')
+
+ record = {
+ 'type': 'Binary',
+ 'name': name,
+ 'abspath': str(binary.abspath),
+ 'version': str(binary.version) if binary.version else '',
+ 'sha256': binary.sha256 or '',
+ 'binprovider': 'env', # Already installed
+ 'machine_id': machine_id,
+ }
+ print(json.dumps(record))
+
+
+def output_binary_missing(name: str, binproviders: str):
+ """Output Binary JSONL record for a missing binary that needs installation."""
+ machine_id = os.environ.get('MACHINE_ID', '')
+
+ record = {
+ 'type': 'Binary',
+ 'name': name,
+ 'binproviders': binproviders, # Providers that can install it
+ 'machine_id': machine_id,
+ }
+ print(json.dumps(record))
+
+
+def main():
+ papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
+ papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl')
+
+ if not papersdl_enabled:
+ sys.exit(0)
+
+ provider = EnvProvider()
+ try:
+ binary = Binary(name=papersdl_binary, binproviders=[provider]).load()
+ if binary.abspath:
+ # Binary found
+ output_binary_found(binary, name='papers-dl')
+ else:
+ # Binary not found
+ output_binary_missing(name='papers-dl', binproviders='pip')
+ except Exception:
+ # Binary not found
+ output_binary_missing(name='papers-dl', binproviders='pip')
+
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
index d87df28d..33045184 100644
--- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
+++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
@@ -72,10 +72,9 @@ class TestParseDomOutlinksWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the outlinks hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run outlinks hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py
index 934b14c7..0164d461 100644
--- a/archivebox/plugins/redirects/tests/test_redirects.py
+++ b/archivebox/plugins/redirects/tests/test_redirects.py
@@ -73,10 +73,9 @@ class TestRedirectsWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the redirects hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run redirects hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py
index ea710c83..c66f7652 100644
--- a/archivebox/plugins/responses/tests/test_responses.py
+++ b/archivebox/plugins/responses/tests/test_responses.py
@@ -72,10 +72,9 @@ class TestResponsesWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the responses hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run responses hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
index 7bcb7951..52c49b59 100644
--- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
+++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js
@@ -1,20 +1,15 @@
#!/usr/bin/env node
/**
- * Take a screenshot of a URL using Chrome/Puppeteer.
+ * Take a screenshot of a URL using an existing Chrome session.
*
- * If a Chrome session exists (from chrome plugin), connects to it via CDP.
- * Otherwise launches a new Chrome instance.
+ * Requires chrome plugin to have already created a Chrome session.
+ * Connects to the existing session via CDP and takes a screenshot.
*
* Usage: on_Snapshot__51_screenshot.js --url= --snapshot-id=
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
- * CHROME_BINARY: Path to Chrome/Chromium binary
- * CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
- * CHROME_USER_AGENT: User agent string (optional)
- * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
- * CHROME_HEADLESS: Run in headless mode (default: true)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
@@ -24,10 +19,8 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
- findChromium,
getEnv,
getEnvBool,
- getEnvInt,
parseResolution,
parseArgs,
readCdpUrl,
@@ -56,7 +49,7 @@ function hasStaticFileOutput() {
}
// Wait for chrome tab to be fully loaded
-async function waitForChromeTabLoaded(timeoutMs = 60000) {
+async function waitForChromeTabLoaded(timeoutMs = 10000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
@@ -72,102 +65,66 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function takeScreenshot(url) {
- const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
- const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
- const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
- const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
- const headless = getEnvBool('CHROME_HEADLESS', true);
-
+ const resolution = getEnv('CHROME_RESOLUTION', '1440,2000');
const { width, height } = parseResolution(resolution);
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
- let browser = null;
- let page = null;
- let connectedToSession = false;
+ // Wait for chrome_navigate to complete (writes navigation.json)
+ const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10);
+ const timeoutMs = timeoutSeconds * 1000;
+ const pageLoaded = await waitForChromeTabLoaded(timeoutMs);
+ if (!pageLoaded) {
+ throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`);
+ }
+
+ // Connect to existing Chrome session (required - no fallback)
+ const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found (chrome plugin must run first)');
+ }
+
+ // Read target_id.txt to get the specific tab for this snapshot
+ const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
+ if (!fs.existsSync(targetIdFile)) {
+ throw new Error('No target_id.txt found (chrome_tab must run first)');
+ }
+ const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
+
+ const browser = await puppeteer.connect({
+ browserWSEndpoint: cdpUrl,
+ defaultViewport: { width, height },
+ });
try {
- // Try to connect to existing Chrome session
- const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- try {
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- defaultViewport: { width, height },
- });
- connectedToSession = true;
-
- // Get existing pages or create new one
- const pages = await browser.pages();
- page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- page = await browser.newPage();
- }
-
- // Set viewport on the page
- await page.setViewport({ width, height });
-
- } catch (e) {
- console.error(`Failed to connect to CDP session: ${e.message}`);
- browser = null;
- }
+ // Get the specific page for this snapshot by target ID
+ const targets = await browser.targets();
+ const target = targets.find(t => t._targetId === targetId);
+ if (!target) {
+ throw new Error(`Target ${targetId} not found in Chrome session`);
}
- // Fall back to launching new browser
- if (!browser) {
- const executablePath = findChromium();
- if (!executablePath) {
- return { success: false, error: 'Chrome binary not found' };
- }
-
- browser = await puppeteer.launch({
- executablePath,
- headless: headless ? 'new' : false,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- `--window-size=${width},${height}`,
- ...(checkSsl ? [] : ['--ignore-certificate-errors']),
- ],
- defaultViewport: { width, height },
- });
-
- page = await browser.newPage();
-
- // Navigate to URL (only if we launched fresh browser)
- if (userAgent) {
- await page.setUserAgent(userAgent);
- }
-
- await page.goto(url, {
- waitUntil: 'networkidle2',
- timeout,
- });
+ const page = await target.page();
+ if (!page) {
+ throw new Error(`Could not get page for target ${targetId}`);
}
- // Take screenshot
+ // Set viewport on the page
+ await page.setViewport({ width, height });
+
+ // Take screenshot (Puppeteer throws on failure)
await page.screenshot({
path: outputPath,
fullPage: true,
});
- if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
- return { success: true, output: outputPath };
- } else {
- return { success: false, error: 'Screenshot file not created' };
- }
+ return outputPath;
- } catch (e) {
- return { success: false, error: `${e.name}: ${e.message}` };
} finally {
- // Only close browser if we launched it (not if we connected to session)
- if (browser && !connectedToSession) {
- await browser.close();
- }
+ // Disconnect from browser (don't close it - we're connected to a shared session)
+ // The chrome_launch hook manages the browser lifecycle
+ await browser.disconnect();
}
}
@@ -181,54 +138,33 @@ async function main() {
process.exit(1);
}
- try {
- // Check if staticfile extractor already handled this (permanent skip)
- if (hasStaticFileOutput()) {
- console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
- // Permanent skip - emit ArchiveResult
- console.log(JSON.stringify({
- type: 'ArchiveResult',
- status: 'skipped',
- output_str: 'staticfile already handled',
- }));
- process.exit(0);
- }
-
- // Only wait for page load if using shared Chrome session
- const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
- if (cdpUrl) {
- // Wait for page to be fully loaded
- const pageLoaded = await waitForChromeTabLoaded(60000);
- if (!pageLoaded) {
- throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
- }
- }
-
- const result = await takeScreenshot(url);
-
- if (result.success) {
- // Success - emit ArchiveResult
- const size = fs.statSync(result.output).size;
- console.error(`Screenshot saved (${size} bytes)`);
- console.log(JSON.stringify({
- type: 'ArchiveResult',
- status: 'succeeded',
- output_str: result.output,
- }));
- process.exit(0);
- } else {
- // Transient error - emit NO JSONL
- console.error(`ERROR: ${result.error}`);
- process.exit(1);
- }
- } catch (e) {
- // Transient error - emit NO JSONL
- console.error(`ERROR: ${e.name}: ${e.message}`);
- process.exit(1);
+ // Check if staticfile extractor already handled this (permanent skip)
+ if (hasStaticFileOutput()) {
+ console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
+ // Permanent skip - emit ArchiveResult
+ console.log(JSON.stringify({
+ type: 'ArchiveResult',
+ status: 'skipped',
+ output_str: 'staticfile already handled',
+ }));
+ process.exit(0);
}
+
+ // Take screenshot (throws on error)
+ const outputPath = await takeScreenshot(url);
+
+ // Success - emit ArchiveResult
+ const size = fs.statSync(outputPath).size;
+ console.error(`Screenshot saved (${size} bytes)`);
+ console.log(JSON.stringify({
+ type: 'ArchiveResult',
+ status: 'succeeded',
+ output_str: outputPath,
+ }));
}
main().catch(e => {
- console.error(`Fatal error: ${e.message}`);
+ // Transient error - emit NO JSONL
+ console.error(`ERROR: ${e.message}`);
process.exit(1);
});
diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py
index 04c89f7e..a522f38b 100644
--- a/archivebox/plugins/screenshot/tests/test_screenshot.py
+++ b/archivebox/plugins/screenshot/tests/test_screenshot.py
@@ -25,6 +25,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
+ chrome_session,
LIB_DIR,
NODE_MODULES_DIR,
CHROME_PLUGIN_DIR,
@@ -62,192 +63,96 @@ def test_verify_deps_with_abx_pkg():
assert node_loaded and node_loaded.abspath, "Node.js required for screenshot plugin"
-def test_extracts_screenshot_from_example_com():
- """Test full workflow: extract screenshot from real example.com via hook.
-
- Replicates production directory structure:
- DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/
- DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/
- DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/
-
- This exercises the "connect to existing session" code path which is the primary
- path in production and accounts for ~50% of the code.
- """
- import signal
- import time
- import os
-
+def test_screenshot_with_chrome_session():
+ """Test multiple screenshot scenarios with one Chrome session to save time."""
with tempfile.TemporaryDirectory() as tmpdir:
- # Replicate exact production directory structure
- data_dir = Path(tmpdir)
- crawl_id = 'test-screenshot-crawl'
+ test_url = 'https://example.com'
snapshot_id = 'test-screenshot-snap'
- # Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
- crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
- chrome_dir = crawl_dir / 'chrome'
- chrome_dir.mkdir(parents=True)
-
- # Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
- snapshot_chrome_dir = snapshot_dir / 'chrome'
- snapshot_chrome_dir.mkdir(parents=True)
-
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir()
-
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
-
- # Step 1: Launch Chrome session at crawl level (background process)
- chrome_launch_process = subprocess.Popen(
- ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
- cwd=str(chrome_dir),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- text=True,
- env=env
- )
-
- # Wait for Chrome to launch
- for i in range(15):
- if chrome_launch_process.poll() is not None:
- stdout, stderr = chrome_launch_process.communicate()
- pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
- if (chrome_dir / 'cdp_url.txt').exists():
- break
- time.sleep(1)
-
- assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
- assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
-
- chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
-
try:
- # Step 2: Create tab at snapshot level
- env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
- result = subprocess.run(
- ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
- cwd=str(snapshot_chrome_dir),
- capture_output=True,
- text=True,
- timeout=60,
- env=env
- )
- assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
- assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
+ with chrome_session(
+ Path(tmpdir),
+ crawl_id='test-screenshot-crawl',
+ snapshot_id=snapshot_id,
+ test_url=test_url,
+ navigate=True,
+ timeout=30,
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
- # Step 3: Navigate to URL
- result = subprocess.run(
- ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- assert result.returncode == 0, f"Navigation failed: {result.stderr}"
- assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
+ # Scenario 1: Basic screenshot extraction
+ screenshot_dir = snapshot_chrome_dir.parent / 'screenshot'
+ screenshot_dir.mkdir()
- # Step 4: Take screenshot (should connect to existing session)
- # Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
+ result = subprocess.run(
+ ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(screenshot_dir),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=env
+ )
- assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
+ assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}"
- # Parse JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
+ # Parse JSONL output
+ result_json = None
+ for line in result.stdout.strip().split('\n'):
+ line = line.strip()
+ if line.startswith('{'):
+ try:
+ record = json.loads(line)
+ if record.get('type') == 'ArchiveResult':
+ result_json = record
+ break
+ except json.JSONDecodeError:
+ pass
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
+ assert result_json and result_json['status'] == 'succeeded'
+ screenshot_file = screenshot_dir / 'screenshot.png'
+ assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
+ assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
- # Verify filesystem output
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
+ # Scenario 2: Custom resolution
+ screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2'
+ screenshot_dir2.mkdir()
+ env['CHROME_RESOLUTION'] = '800,600'
- # Verify file is valid PNG
- file_size = screenshot_file.stat().st_size
- assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
- assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
+ result = subprocess.run(
+ ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(screenshot_dir2),
+ capture_output=True,
+ text=True,
+ timeout=30,
+ env=env
+ )
- # Check PNG magic bytes
- screenshot_data = screenshot_file.read_bytes()
- assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
+ assert result.returncode == 0
+ screenshot_file2 = screenshot_dir2 / 'screenshot.png'
+ assert screenshot_file2.exists()
+ file_size = screenshot_file2.stat().st_size
+ assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}"
- finally:
- # Cleanup: Kill Chrome
- try:
- chrome_launch_process.send_signal(signal.SIGTERM)
- chrome_launch_process.wait(timeout=5)
- except:
- pass
- try:
- os.kill(chrome_pid, signal.SIGKILL)
- except OSError:
- pass
+ # Scenario 3: Wrong target ID (error case)
+ screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
+ screenshot_dir3.mkdir()
+ (snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
+ result = subprocess.run(
+ ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+ cwd=str(screenshot_dir3),
+ capture_output=True,
+ text=True,
+ timeout=5,
+ env=env
+ )
-def test_extracts_screenshot_without_session():
- """Test screenshot extraction without existing Chrome session (fallback to own browser)."""
- with tempfile.TemporaryDirectory() as tmpdir:
- # Create proper snapshot directory structure
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
+ assert result.returncode != 0
+ assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
- # Don't set up Chrome session or staticfile - screenshot should launch its own browser
- env = get_test_env()
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
-
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-
- # Parse JSONL output
- result_json = None
- for line in result.stdout.strip().split('\n'):
- line = line.strip()
- if line.startswith('{'):
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
-
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- assert 'screenshot.png' in result_json['output_str']
-
- # Verify file created
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), "screenshot.png not created"
- assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
+ except RuntimeError as e:
+ if 'Chrome' in str(e) or 'CDP' in str(e):
+ pytest.skip(f"Chrome session setup failed: {e}")
+ raise
def test_skips_when_staticfile_exists():
@@ -344,57 +249,42 @@ def test_reports_missing_chrome():
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
-def test_custom_resolution_and_user_agent():
- """Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected."""
+def test_waits_for_navigation_timeout():
+ """Test that screenshot waits for navigation.json and times out quickly if missing."""
+ import time
+
with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
+ tmpdir = Path(tmpdir)
+
+ # Create chrome directory without navigation.json to trigger timeout
+ chrome_dir = tmpdir.parent / 'chrome'
+ chrome_dir.mkdir(parents=True, exist_ok=True)
+ (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
+ (chrome_dir / 'target_id.txt').write_text('test-target-id')
+ # Intentionally NOT creating navigation.json to test timeout
+
+ screenshot_dir = tmpdir / 'screenshot'
+ screenshot_dir.mkdir()
env = get_test_env()
- env['CHROME_RESOLUTION'] = '800,600'
- env['CHROME_USER_AGENT'] = 'Test/1.0'
+ env['SCREENSHOT_TIMEOUT'] = '2' # Set 2 second timeout
+ start_time = time.time()
result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
+ ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-timeout'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
- timeout=120,
+ timeout=5, # Test timeout slightly higher than SCREENSHOT_TIMEOUT
env=env
)
+ elapsed = time.time() - start_time
- assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), "screenshot.png not created"
- # Resolution affects file size
- assert screenshot_file.stat().st_size > 500, "Screenshot too small"
-
-
-def test_ssl_check_disabled():
- """Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- env['CHROME_CHECK_SSL_VALIDITY'] = 'False'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
-
- assert result.returncode == 0, f"Should succeed: {result.stderr}"
- assert (screenshot_dir / 'screenshot.png').exists()
+ # Should fail when navigation.json doesn't appear
+ assert result.returncode != 0, "Should fail when navigation.json missing"
+ assert 'not loaded' in result.stderr.lower() or 'navigate' in result.stderr.lower(), f"Should mention navigation timeout: {result.stderr}"
+ # Should complete within 3s (2s wait + 1s overhead)
+ assert elapsed < 3, f"Should timeout within 3s, took {elapsed:.1f}s"
def test_config_timeout_honored():
@@ -485,345 +375,114 @@ def test_invalid_resolution_format():
# (depending on implementation - script should not crash with uncaught error)
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
-
-def test_boolean_env_var_parsing():
- """Test that boolean environment variables are parsed correctly."""
- import time
+def test_no_cdp_url_fails():
+ """Test error when chrome dir exists but no cdp_url.txt."""
with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
-
- # Test various boolean formats for CHROME_HEADLESS
- for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']:
- env['CHROME_HEADLESS'] = bool_val
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- # Should either succeed or fail, but shouldn't crash on boolean parsing
- assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"
-
- # Clean up screenshot file if created
- screenshot_file = screenshot_dir / 'screenshot.png'
- if screenshot_file.exists():
- screenshot_file.unlink()
-
- time.sleep(0.5) # Brief pause between attempts
-
-
-def test_integer_env_var_parsing():
- """Test that integer environment variables are parsed correctly."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
-
- # Test valid and invalid integer formats for CHROME_TIMEOUT
- test_cases = [
- ('60', True), # Valid integer
- ('invalid', True), # Invalid - should use default
- ('', True), # Empty - should use default
- ]
-
- for timeout_val, should_work in test_cases:
- env['CHROME_TIMEOUT'] = timeout_val
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- # Should either succeed or fail gracefully, but shouldn't crash on int parsing
- assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"
-
- # Clean up screenshot file if created
- screenshot_file = screenshot_dir / 'screenshot.png'
- if screenshot_file.exists():
- screenshot_file.unlink()
-
-
-def test_extracts_screenshot_with_all_config_options():
- """Test screenshot with comprehensive config to exercise all code paths."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- # Set ALL config options to exercise all code paths
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
- env['CHROME_RESOLUTION'] = '800,600'
- env['CHROME_USER_AGENT'] = 'TestBot/1.0'
- env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch
- env['CHROME_TIMEOUT'] = '60'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
-
- assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"
-
- # Verify JSONL output with success
- result_json = None
- for line in result.stdout.strip().split('\n'):
- if line.strip().startswith('{'):
- try:
- record = json.loads(line)
- if record.get('type') == 'ArchiveResult':
- result_json = record
- break
- except json.JSONDecodeError:
- pass
-
- assert result_json, "Should have ArchiveResult JSONL output"
- assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
- assert 'screenshot.png' in result_json['output_str']
-
- # Verify file created
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), "screenshot.png should be created"
- assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
-
-
-def test_headless_mode_false():
- """Test headless=false code path specifically."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- # Explicitly test headless=false (exercises the ternary false branch)
- env['CHROME_HEADLESS'] = 'false'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
- # Should work or fail gracefully
- assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
-
-
-def test_invalid_url_causes_error():
- """Test error path with invalid URL that causes navigation failure."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- env['CHROME_TIMEOUT'] = '5' # Short timeout
-
- # Use invalid URL to trigger error path
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=30,
- env=env
- )
-
- # Should fail due to navigation error
- assert result.returncode != 0, "Should fail on invalid URL"
- # Should NOT emit JSONL (transient error)
- jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
- assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
-
-
-def test_with_corrupted_cdp_url_falls_back():
- """Test that corrupted CDP URL file causes fallback to launching browser."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- # Create chrome directory with corrupted CDP URL
- chrome_dir = snapshot_dir / 'chrome'
+ tmpdir = Path(tmpdir)
+ chrome_dir = tmpdir / 'chrome'
chrome_dir.mkdir()
- (chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')
+ # Create target_id.txt and navigation.json but NOT cdp_url.txt
+ (chrome_dir / 'target_id.txt').write_text('test-target')
+ (chrome_dir / 'navigation.json').write_text('{}')
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
- env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test
+ screenshot_dir = tmpdir / 'screenshot'
+ screenshot_dir.mkdir()
- # Screenshot should try CDP, fail quickly, then fall back to launching own browser
result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
+ ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
- timeout=30,
- env=env
+ timeout=7,
+ env=get_test_env()
)
- # Should succeed by falling back to launching browser
- assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
- assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
-
- # Verify screenshot was created via fallback path
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), "Screenshot should be created via fallback"
+ assert result.returncode != 0
+ assert 'no chrome session' in result.stderr.lower()
-def test_user_agent_is_applied():
- """Test that CHROME_USER_AGENT is actually applied when launching browser."""
+def test_no_target_id_fails():
+ """Test error when cdp_url exists but no target_id.txt."""
with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
+ tmpdir = Path(tmpdir)
+ chrome_dir = tmpdir / 'chrome'
+ chrome_dir.mkdir()
+ # Create cdp_url.txt and navigation.json but NOT target_id.txt
+ (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
+ (chrome_dir / 'navigation.json').write_text('{}')
- env = get_test_env()
- env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)'
- env['CHROME_HEADLESS'] = 'true'
+ screenshot_dir = tmpdir / 'screenshot'
+ screenshot_dir.mkdir()
result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
+ ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
- timeout=120,
- env=env
+ timeout=7,
+ env=get_test_env()
)
- # Should succeed with custom user agent
- assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists(), "Screenshot should be created"
+ assert result.returncode != 0
+ assert 'target_id.txt' in result.stderr.lower()
-def test_check_ssl_false_branch():
- """Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
+def test_invalid_cdp_url_fails():
+ """Test error with malformed CDP URL."""
with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
+ tmpdir = Path(tmpdir)
+ chrome_dir = tmpdir / 'chrome'
+ chrome_dir.mkdir()
+ (chrome_dir / 'cdp_url.txt').write_text('invalid-url')
+ (chrome_dir / 'target_id.txt').write_text('test-target')
+ (chrome_dir / 'navigation.json').write_text('{}')
- env = get_test_env()
- env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
- env['CHROME_HEADLESS'] = 'true'
+ screenshot_dir = tmpdir / 'screenshot'
+ screenshot_dir.mkdir()
- # Test with both boolean false and string 'false'
result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
+ ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
- timeout=120,
- env=env
+ timeout=7,
+ env=get_test_env()
)
- assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
- assert (screenshot_dir / 'screenshot.png').exists()
+ assert result.returncode != 0
-def test_alternative_env_var_names():
- """Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc)."""
+def test_invalid_timeout_uses_default():
+ """Test that invalid SCREENSHOT_TIMEOUT falls back to default."""
with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
+ tmpdir = Path(tmpdir)
+ chrome_dir = tmpdir / 'chrome'
+ chrome_dir.mkdir()
+ # No navigation.json to trigger timeout
+ (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/test')
+ (chrome_dir / 'target_id.txt').write_text('test')
+
+ screenshot_dir = tmpdir / 'screenshot'
+ screenshot_dir.mkdir()
env = get_test_env()
- # Use alternative env var names (without CHROME_ prefix)
- env['TIMEOUT'] = '45'
- env['RESOLUTION'] = '1024,768'
- env['USER_AGENT'] = 'AltBot/1.0'
- env['CHECK_SSL_VALIDITY'] = 'false'
+ env['SCREENSHOT_TIMEOUT'] = 'invalid' # Should fallback to default (10s becomes NaN, treated as 0)
+ import time
+ start = time.time()
result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
+ ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
- timeout=120,
+ timeout=5,
env=env
)
+ elapsed = time.time() - start
- assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
- assert (screenshot_dir / 'screenshot.png').exists()
-
-
-def test_very_large_resolution():
- """Test screenshot with very large resolution."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution
- env['CHROME_HEADLESS'] = 'true'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
-
- assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
- screenshot_file = screenshot_dir / 'screenshot.png'
- assert screenshot_file.exists()
- # 4K screenshot should be larger
- assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
-
-
-def test_very_small_resolution():
- """Test screenshot with very small resolution."""
- with tempfile.TemporaryDirectory() as tmpdir:
- data_dir = Path(tmpdir)
- snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
- screenshot_dir = snapshot_dir / 'screenshot'
- screenshot_dir.mkdir(parents=True)
-
- env = get_test_env()
- env['CHROME_RESOLUTION'] = '320,240' # Very small
- env['CHROME_HEADLESS'] = 'true'
-
- result = subprocess.run(
- ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
- cwd=str(screenshot_dir),
- capture_output=True,
- text=True,
- timeout=120,
- env=env
- )
-
- assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
- assert (screenshot_dir / 'screenshot.png').exists()
+ # With invalid timeout, parseInt returns NaN, which should be handled
+ assert result.returncode != 0
+ assert elapsed < 2 # Should fail quickly, not wait 10s
if __name__ == '__main__':
diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py
index 23beaa76..e365e4b0 100644
--- a/archivebox/plugins/seo/tests/test_seo.py
+++ b/archivebox/plugins/seo/tests/test_seo.py
@@ -72,10 +72,9 @@ class TestSEOWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the SEO hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run SEO hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py
index 0fbd3c07..a473f152 100644
--- a/archivebox/plugins/singlefile/tests/test_singlefile.py
+++ b/archivebox/plugins/singlefile/tests/test_singlefile.py
@@ -22,7 +22,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
- setup_chrome_session,
+ chrome_session,
cleanup_chrome,
)
@@ -96,17 +96,15 @@ def test_singlefile_with_chrome_session():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
- try:
- # Set up Chrome session using shared helper
- chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
- tmpdir=tmpdir,
- crawl_id='singlefile-test-crawl',
- snapshot_id='singlefile-test-snap',
- test_url=TEST_URL,
- navigate=False, # Don't navigate, singlefile will do that
- timeout=20,
- )
-
+ # Set up Chrome session using shared helper
+ with chrome_session(
+ tmpdir=tmpdir,
+ crawl_id='singlefile-test-crawl',
+ snapshot_id='singlefile-test-snap',
+ test_url=TEST_URL,
+ navigate=False, # Don't navigate, singlefile will do that
+ timeout=20,
+ ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
# So we need to run from a directory that has ../chrome pointing to our chrome dir
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
@@ -117,9 +115,8 @@ def test_singlefile_with_chrome_session():
if not chrome_link.exists():
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
- env = get_test_env()
+ # Use env from chrome_session
env['SINGLEFILE_ENABLED'] = 'true'
- env['CHROME_HEADLESS'] = 'true'
# Run singlefile - it should find and use the existing Chrome session
result = subprocess.run(
@@ -143,9 +140,6 @@ def test_singlefile_with_chrome_session():
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
- finally:
- cleanup_chrome(chrome_launch_process, chrome_pid)
-
def test_singlefile_disabled_skips():
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py
index 48ec0a6c..6261c26b 100644
--- a/archivebox/plugins/ssl/tests/test_ssl.py
+++ b/archivebox/plugins/ssl/tests/test_ssl.py
@@ -72,10 +72,9 @@ class TestSSLWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the SSL hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run SSL hook with the active Chrome session
result = subprocess.run(
diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py
index f80d0839..3f4412ae 100644
--- a/archivebox/plugins/staticfile/tests/test_staticfile.py
+++ b/archivebox/plugins/staticfile/tests/test_staticfile.py
@@ -72,16 +72,14 @@ class TestStaticfileWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
- ) as (chrome_process, chrome_pid, snapshot_chrome_dir):
- # Get environment and run the staticfile hook
- env = get_test_env()
- env['CHROME_HEADLESS'] = 'true'
+ ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
+ # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
+
# Run staticfile hook with the active Chrome session
result = subprocess.run(
['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
- cwd=str(snapshot_chrome_dir,
- env=get_test_env()),
+ cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation
diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py
index 4ced2029..976a4e8c 100644
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -384,11 +384,11 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
- """Test that ArchiveResultWorker.get_queue() only blocks on foreground extractors."""
+ """Test that background hooks don't block foreground extractors from running."""
os.chdir(tmp_path)
- # This test verifies the fix for the orchestrator bug where background hooks
- # were blocking parser extractors from running
+ # This test verifies that background hooks run concurrently with foreground hooks
+ # and don't block parser extractors
# Start a crawl
env = os.environ.copy()
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index ed8bf832..7c7b4d0b 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -1,15 +1,13 @@
"""
Orchestrator for managing worker processes.
-The Orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
-and lazily spawns worker processes when there is work to be done.
+The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed.
Architecture:
- Orchestrator (main loop, polls queues)
- ├── CrawlWorker subprocess(es)
- ├── SnapshotWorker subprocess(es)
- └── ArchiveResultWorker subprocess(es)
- └── Each worker spawns task subprocesses via CLI
+ Orchestrator (polls Crawl queue)
+ └── CrawlWorker(s) (one per active Crawl)
+ └── SnapshotWorker(s) (one per Snapshot, up to limit)
+ └── Hook Processes (sequential, forked by SnapshotWorker)
Usage:
# Default: runs forever (for use as subprocess of server)
@@ -38,7 +36,7 @@ from django.utils import timezone
from rich import print
from archivebox.misc.logging_util import log_worker_event
-from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
+from .worker import Worker, CrawlWorker
def _run_orchestrator_process(exit_on_idle: bool) -> None:
@@ -52,22 +50,27 @@ def _run_orchestrator_process(exit_on_idle: bool) -> None:
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
-
+
The orchestrator:
- 1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
- 2. If items exist and fewer than MAX_CONCURRENT workers are running, spawns workers
+ 1. Polls Crawl queue
+ 2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers
3. Monitors worker health and cleans up stale PIDs
- 4. Exits when all queues are empty (unless daemon mode)
+ 4. Exits when queue is empty (unless daemon mode)
+
+ Architecture:
+ - Orchestrator spawns CrawlWorkers (one per active Crawl)
+ - Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit)
+ - Each SnapshotWorker runs hooks sequentially for its snapshot
"""
-
- WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker]
+
+ # Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
+ WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
# Configuration
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
- MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type
- MAX_TOTAL_WORKERS: int = 24 # Max workers across all types
-
+ MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously
+
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
self.exit_on_idle = exit_on_idle
self.crawl_id = crawl_id # If set, only process work for this crawl
@@ -76,11 +79,9 @@ class Orchestrator:
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
- # In foreground mode (exit_on_idle=True), limit workers but allow enough
- # for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
+ # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
if self.exit_on_idle:
- self.MAX_WORKERS_PER_TYPE = 1
- self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
+ self.MAX_CRAWL_WORKERS = 1
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -109,14 +110,18 @@ class Orchestrator:
# Clean up any stale Process records from previous runs
stale_count = Process.cleanup_stale_running()
+ # Clean up orphaned Chrome processes from previous crashes
+ chrome_count = Process.cleanup_orphaned_chrome()
+
# Collect startup metadata
metadata = {
- 'max_workers_per_type': self.MAX_WORKERS_PER_TYPE,
- 'max_total_workers': self.MAX_TOTAL_WORKERS,
+ 'max_crawl_workers': self.MAX_CRAWL_WORKERS,
'poll_interval': self.POLL_INTERVAL,
}
if stale_count:
metadata['cleaned_stale_pids'] = stale_count
+ if chrome_count:
+ metadata['cleaned_orphaned_chrome'] = chrome_count
log_worker_event(
worker_type='Orchestrator',
@@ -126,8 +131,34 @@ class Orchestrator:
metadata=metadata,
)
+ def terminate_all_workers(self) -> None:
+ """Terminate all running worker processes."""
+ from archivebox.machine.models import Process
+ import signal
+
+ # Get all running worker processes
+ running_workers = Process.objects.filter(
+ process_type=Process.TypeChoices.WORKER,
+ status__in=['running', 'started']
+ )
+
+ for worker_process in running_workers:
+ try:
+ # Send SIGTERM to gracefully terminate the worker
+ os.kill(worker_process.pid, signal.SIGTERM)
+ except ProcessLookupError:
+ # Process already dead
+ pass
+ except Exception:
+ # Ignore other errors during shutdown
+ pass
+
def on_shutdown(self, error: BaseException | None = None) -> None:
"""Called when orchestrator shuts down."""
+ # Terminate all worker processes in exit_on_idle mode
+ if self.exit_on_idle:
+ self.terminate_all_workers()
+
# Update Process record status
if hasattr(self, 'db_process') and self.db_process:
# KeyboardInterrupt is a graceful shutdown, not an error
@@ -163,20 +194,15 @@ class Orchestrator:
return len(WorkerClass.get_running_workers())
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
- """Determine if we should spawn a new worker of the given type."""
+ """Determine if we should spawn a new CrawlWorker."""
if queue_count == 0:
return False
- # Check per-type limit
+ # Check CrawlWorker limit
running_workers = WorkerClass.get_running_workers()
running_count = len(running_workers)
- if running_count >= self.MAX_WORKERS_PER_TYPE:
- return False
-
- # Check total limit
- total_workers = self.get_total_worker_count()
- if total_workers >= self.MAX_TOTAL_WORKERS:
+ if running_count >= self.MAX_CRAWL_WORKERS:
return False
# Check if we already have enough workers for the queue size
@@ -190,7 +216,7 @@ class Orchestrator:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
- pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
+ pid = WorkerClass.start(crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
@@ -259,24 +285,49 @@ class Orchestrator:
def check_queues_and_spawn_workers(self) -> dict[str, int]:
"""
- Check all queues and spawn workers as needed.
- Returns dict of queue sizes by worker type.
+ Check Crawl queue and spawn CrawlWorkers as needed.
+ Returns dict of queue sizes.
"""
+ from archivebox.crawls.models import Crawl
+
queue_sizes = {}
- for WorkerClass in self.WORKER_TYPES:
- # Get queue for this worker type
- # Need to instantiate worker to get queue (for model access)
- worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id) # temp instance just for queue access
- queue = worker.get_queue()
- queue_count = queue.count()
- queue_sizes[WorkerClass.name] = queue_count
+ # Only check Crawl queue
+ crawl_queue = Crawl.objects.filter(
+ retry_at__lte=timezone.now()
+ ).exclude(
+ status__in=Crawl.FINAL_STATES
+ )
+
+ # Apply crawl_id filter if set
+ if self.crawl_id:
+ crawl_queue = crawl_queue.filter(id=self.crawl_id)
+
+ crawl_queue = crawl_queue.order_by('retry_at')
+ crawl_count = crawl_queue.count()
+ queue_sizes['crawl'] = crawl_count
+
+ # Spawn CrawlWorker if needed
+ if self.should_spawn_worker(CrawlWorker, crawl_count):
+ # Claim next crawl
+ crawl = crawl_queue.first()
+ if crawl and self._claim_crawl(crawl):
+ CrawlWorker.start(crawl_id=str(crawl.id))
- # Spawn worker if needed
- if self.should_spawn_worker(WorkerClass, queue_count):
- self.spawn_worker(WorkerClass)
-
return queue_sizes
+
+ def _claim_crawl(self, crawl) -> bool:
+ """Atomically claim a crawl using optimistic locking."""
+ from archivebox.crawls.models import Crawl
+
+ updated = Crawl.objects.filter(
+ pk=crawl.pk,
+ retry_at=crawl.retry_at,
+ ).update(
+ retry_at=timezone.now() + timedelta(hours=24), # Long lock (crawls take time)
+ )
+
+ return updated == 1
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
"""Check if any queue has pending work."""
@@ -287,30 +338,21 @@ class Orchestrator:
return self.get_total_worker_count() > 0
def has_future_work(self) -> bool:
- """Check if there's work scheduled for the future (retry_at > now)."""
- for WorkerClass in self.WORKER_TYPES:
- worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id)
- Model = worker.get_model()
+ """Check if there's work scheduled for the future (retry_at > now) in Crawl queue."""
+ from archivebox.crawls.models import Crawl
- # Build filter for future work, respecting crawl_id if set
- qs = Model.objects.filter(
- retry_at__gt=timezone.now()
- ).exclude(
- status__in=Model.FINAL_STATES
- )
+ # Build filter for future work, respecting crawl_id if set
+ qs = Crawl.objects.filter(
+ retry_at__gt=timezone.now()
+ ).exclude(
+ status__in=Crawl.FINAL_STATES
+ )
- # Apply crawl_id filter if set
- if self.crawl_id:
- if WorkerClass.name == 'crawl':
- qs = qs.filter(id=self.crawl_id)
- elif WorkerClass.name == 'snapshot':
- qs = qs.filter(crawl_id=self.crawl_id)
- elif WorkerClass.name == 'archiveresult':
- qs = qs.filter(snapshot__crawl_id=self.crawl_id)
+ # Apply crawl_id filter if set
+ if self.crawl_id:
+ qs = qs.filter(id=self.crawl_id)
- if qs.count() > 0:
- return True
- return False
+ return qs.count() > 0
def on_tick(self, queue_sizes: dict[str, int]) -> None:
"""Called each orchestrator tick. Override for custom behavior."""
@@ -345,20 +387,20 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
- from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
- from archivebox.misc.logging import IS_TTY, CONSOLE
+ from rich.live import Live
+ from archivebox.misc.logging import IS_TTY
+ from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
import sys
import os
- # Enable progress bars only in TTY + foreground mode
+ # Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
self.on_startup()
- task_ids = {}
if not show_progress:
- # No progress bars - just run normally
- self._run_orchestrator_loop(None, task_ids)
+ # No progress layout - just run normally
+ self._run_orchestrator_loop(None)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -384,14 +426,16 @@ class Orchestrator:
original_console = logging_module.CONSOLE
logging_module.CONSOLE = orchestrator_console
- # Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
- with Progress(
- TextColumn("[cyan]{task.description}"),
- BarColumn(bar_width=40),
- TaskProgressColumn(),
+ # Create layout and run with Live display
+ progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id)
+
+ with Live(
+ progress_layout.get_layout(),
+ refresh_per_second=4,
+ screen=True,
console=orchestrator_console,
- ) as progress:
- self._run_orchestrator_loop(progress, task_ids)
+ ):
+ self._run_orchestrator_loop(progress_layout)
# Restore original console
logging_module.CONSOLE = original_console
@@ -409,22 +453,68 @@ class Orchestrator:
pass
# stdout_for_console is closed by orchestrator_console
- def _run_orchestrator_loop(self, progress, task_ids):
+ def _run_orchestrator_loop(self, progress_layout):
"""Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None
+ tick_count = 0
+
+ # Track snapshot progress to detect changes
+ snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
+
try:
while True:
+ tick_count += 1
+
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
- # Debug queue sizes (only when changed)
- if progress and queue_sizes != last_queue_sizes:
- progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
- last_queue_sizes = queue_sizes.copy()
+ # Get worker counts for each type
+ worker_counts = {
+ WorkerClass.name: len(WorkerClass.get_running_workers())
+ for WorkerClass in self.WORKER_TYPES
+ }
- # Update progress bars
- if progress:
+ # Update layout if enabled
+ if progress_layout:
+ # Get crawl queue and worker counts
+ crawl_queue_count = queue_sizes.get('crawl', 0)
+ crawl_workers_count = worker_counts.get('crawl', 0)
+
+ # Determine orchestrator status
+ if crawl_workers_count > 0:
+ status = "Working"
+ elif crawl_queue_count > 0:
+ status = "Spawning"
+ else:
+ status = "Idle"
+
+ # Update orchestrator status
+ progress_layout.update_orchestrator_status(
+ status=status,
+ crawl_queue_count=crawl_queue_count,
+ crawl_workers_count=crawl_workers_count,
+ max_crawl_workers=self.MAX_CRAWL_WORKERS,
+ )
+
+ # Log queue size changes
+ if queue_sizes != last_queue_sizes:
+ for worker_type, count in queue_sizes.items():
+ old_count = last_queue_sizes.get(worker_type, 0)
+ if count != old_count:
+ if count > old_count:
+ progress_layout.log_event(
+ f"{worker_type.capitalize()} queue: {old_count} → {count}",
+ style="yellow"
+ )
+ else:
+ progress_layout.log_event(
+ f"{worker_type.capitalize()} queue: {old_count} → {count}",
+ style="green"
+ )
+ last_queue_sizes = queue_sizes.copy()
+
+ # Update snapshot progress
from archivebox.core.models import Snapshot
# Get all started snapshots (optionally filtered by crawl_id)
@@ -438,9 +528,36 @@ class Orchestrator:
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
- # Debug snapshot count (only when changed)
+ # Log snapshot count changes and details
if len(active_snapshots) != last_snapshot_count:
- progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
+ if last_snapshot_count is not None:
+ if len(active_snapshots) > last_snapshot_count:
+ progress_layout.log_event(
+ f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}",
+ style="cyan"
+ )
+ # Log which snapshots started
+ for snapshot in active_snapshots[-1:]: # Just show the newest one
+ progress_layout.log_event(
+ f"Started: {snapshot.url[:60]}",
+ style="green"
+ )
+
+ # Log SnapshotWorker count
+ from archivebox.machine.models import Process
+ all_workers = Process.objects.filter(
+ process_type=Process.TypeChoices.WORKER,
+ status__in=['running', 'started']
+ ).count()
+ progress_layout.log_event(
+ f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)",
+ style="grey53"
+ )
+ else:
+ progress_layout.log_event(
+ f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}",
+ style="blue"
+ )
last_snapshot_count = len(active_snapshots)
# Track which snapshots are still active
@@ -450,13 +567,14 @@ class Orchestrator:
active_ids.add(snapshot.id)
total = snapshot.archiveresult_set.count()
- if total == 0:
- continue
-
completed = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).count()
+ # Count hooks by status for debugging
+ queued = snapshot.archiveresult_set.filter(status='queued').count()
+ started = snapshot.archiveresult_set.filter(status='started').count()
+
# Find currently running hook (ordered by hook_name to get lowest step number)
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
if not current_ar:
@@ -472,24 +590,78 @@ class Orchestrator:
# Clean up the name: remove prefix and extension
clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
- current_plugin = f" • {clean_name}"
+ current_plugin = clean_name
+ elif total == 0:
+ # Snapshot just started, hooks not created yet
+ current_plugin = "initializing"
+ elif queued > 0:
+ # Hooks created but none started yet
+ current_plugin = "waiting"
- # Build description with URL + current plugin
- url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
- description = f"{url}{current_plugin}"
+ # Update snapshot worker (show even if no hooks yet)
+ # Debug: Log first time we see this snapshot
+ if snapshot.id not in progress_layout.snapshot_to_worker:
+ progress_layout.log_event(
+ f"Assigning to worker: {snapshot.url[:50]}",
+ style="grey53"
+ )
- # Create or update task
- if snapshot.id not in task_ids:
- task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
- else:
- # Update both progress and description
- progress.update(task_ids[snapshot.id], description=description, completed=completed)
+ # Track progress changes
+ prev_progress = snapshot_progress.get(snapshot.id, (0, 0, ''))
+ curr_progress = (total, completed, current_plugin)
- # Remove tasks for snapshots that are no longer active
- for snapshot_id in list(task_ids.keys()):
+ if prev_progress != curr_progress:
+ prev_total, prev_completed, prev_plugin = prev_progress
+
+ # Log hooks created
+ if total > prev_total:
+ progress_layout.log_event(
+ f"Hooks created: {total} for {snapshot.url[:40]}",
+ style="cyan"
+ )
+
+ # Log hook completion
+ if completed > prev_completed:
+ progress_layout.log_event(
+ f"Hook completed: {completed}/{total} for {snapshot.url[:40]}",
+ style="green"
+ )
+
+ # Log plugin change
+ if current_plugin and current_plugin != prev_plugin:
+ progress_layout.log_event(
+ f"Running: {current_plugin} ({snapshot.url[:40]})",
+ style="yellow"
+ )
+
+ snapshot_progress[snapshot.id] = curr_progress
+
+ # Debug: Every 10 ticks, log detailed status if stuck at initializing
+ if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing":
+ progress_layout.log_event(
+ f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})",
+ style="red"
+ )
+
+ progress_layout.update_snapshot_worker(
+ snapshot_id=snapshot.id,
+ url=snapshot.url,
+ total=max(total, 1), # Show at least 1 to avoid division by zero
+ completed=completed,
+ current_plugin=current_plugin,
+ )
+
+ # Remove snapshots that are no longer active
+ for snapshot_id in list(progress_layout.snapshot_to_worker.keys()):
if snapshot_id not in active_ids:
- progress.remove_task(task_ids[snapshot_id])
- del task_ids[snapshot_id]
+ progress_layout.log_event(
+ f"Snapshot completed/removed",
+ style="blue"
+ )
+ progress_layout.remove_snapshot_worker(snapshot_id)
+ # Also clean up progress tracking
+ if snapshot_id in snapshot_progress:
+ del snapshot_progress[snapshot_id]
# Track idle state
has_pending = self.has_pending_work(queue_sizes)
@@ -503,6 +675,8 @@ class Orchestrator:
# Check if we should exit
if self.should_exit(queue_sizes):
+ if progress_layout:
+ progress_layout.log_event("All work complete", style="green")
log_worker_event(
worker_type='Orchestrator',
event='All work complete',
@@ -514,8 +688,12 @@ class Orchestrator:
time.sleep(self.POLL_INTERVAL)
except KeyboardInterrupt:
+ if progress_layout:
+ progress_layout.log_event("Interrupted by user", style="red")
print() # Newline after ^C
except BaseException as e:
+ if progress_layout:
+ progress_layout.log_event(f"Error: {e}", style="red")
self.on_shutdown(error=e)
raise
else:
diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py
index 5a0c0980..439bdda4 100644
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -34,7 +34,7 @@ CPU_COUNT = cpu_count()
WORKER_TYPES: dict[str, type['Worker']] = {}
-def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
+def _run_worker(worker_class_name: str, worker_id: int, **kwargs):
"""
Module-level function to run a worker. Must be at module level for pickling.
"""
@@ -43,16 +43,28 @@ def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
# Get worker class by name to avoid pickling class objects
worker_cls = WORKER_TYPES[worker_class_name]
- worker = worker_cls(worker_id=worker_id, daemon=daemon, **kwargs)
+ worker = worker_cls(worker_id=worker_id, **kwargs)
+ worker.runloop()
+
+
+def _run_snapshot_worker(snapshot_id: str, worker_id: int, **kwargs):
+ """
+ Module-level function to run a SnapshotWorker for a specific snapshot.
+ Must be at module level for pickling compatibility.
+ """
+ from archivebox.config.django import setup_django
+ setup_django()
+
+ worker = SnapshotWorker(snapshot_id=snapshot_id, worker_id=worker_id, **kwargs)
worker.runloop()
class Worker:
"""
- Base worker class that polls a queue and processes items directly.
+ Base worker class for CrawlWorker and SnapshotWorker.
- Each item is processed by calling its state machine tick() method.
- Workers exit when idle for too long (unless daemon mode).
+ Workers are spawned as subprocesses to process crawls and snapshots.
+ Each worker type has its own custom runloop implementation.
"""
name: ClassVar[str] = 'worker'
@@ -60,16 +72,10 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
- POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
- IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
- def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
+ def __init__(self, worker_id: int = 0, **kwargs: Any):
self.worker_id = worker_id
- self.daemon = daemon
- self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
- self.pid_file: Path | None = None
- self.idle_count: int = 0
def __repr__(self) -> str:
return f'[underline]{self.__class__.__name__}[/underline]\\[id={self.worker_id}, pid={self.pid}]'
@@ -78,55 +84,6 @@ class Worker:
"""Get the Django model class. Subclasses must override this."""
raise NotImplementedError("Subclasses must implement get_model()")
- def get_queue(self) -> QuerySet:
- """Get the queue of objects ready for processing."""
- Model = self.get_model()
- return Model.objects.filter(
- retry_at__lte=timezone.now()
- ).exclude(
- status__in=Model.FINAL_STATES
- ).order_by('retry_at')
-
- def claim_next(self):
- """
- Atomically claim the next object from the queue.
- Returns the claimed object or None if queue is empty or claim failed.
- """
- Model = self.get_model()
-
- queue = self.get_queue()
- obj = queue.first()
- if obj is None:
- return None
-
- # Atomic claim using optimistic locking on retry_at
- claimed = Model.objects.filter(
- pk=obj.pk,
- retry_at=obj.retry_at,
- ).update(
- retry_at=timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
- )
-
- if claimed == 1:
- obj.refresh_from_db()
- return obj
-
- return None # Someone else claimed it
-
- def process_item(self, obj) -> bool:
- """
- Process a single item by calling its state machine tick().
- Returns True on success, False on failure.
- Subclasses can override for custom processing.
- """
- try:
- obj.sm.tick()
- return True
- except Exception as e:
- # Error will be logged in runloop's completion event
- traceback.print_exc()
- return False
-
def on_startup(self) -> None:
"""Called when worker starts."""
from archivebox.machine.models import Process
@@ -139,7 +96,7 @@ class Worker:
if self.db_process.process_type != Process.TypeChoices.WORKER:
self.db_process.process_type = Process.TypeChoices.WORKER
update_fields.append('process_type')
- # Store worker type name (crawl/snapshot/archiveresult) in worker_type field
+ # Store worker type name (crawl/snapshot) in worker_type field
if not self.db_process.worker_type:
self.db_process.worker_type = self.name
update_fields.append('worker_type')
@@ -148,13 +105,11 @@ class Worker:
# Determine worker type for logging
worker_type_name = self.__class__.__name__
- indent_level = 1 # Default for most workers
+ indent_level = 1 # Default for CrawlWorker
- # Adjust indent level based on worker type
+ # SnapshotWorker gets indent level 2
if 'Snapshot' in worker_type_name:
indent_level = 2
- elif 'ArchiveResult' in worker_type_name:
- indent_level = 3
log_worker_event(
worker_type=worker_type_name,
@@ -162,10 +117,6 @@ class Worker:
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
- metadata={
- 'max_concurrent': self.MAX_CONCURRENT_TASKS,
- 'poll_interval': self.POLL_INTERVAL,
- },
)
def on_shutdown(self, error: BaseException | None = None) -> None:
@@ -179,12 +130,10 @@ class Worker:
# Determine worker type for logging
worker_type_name = self.__class__.__name__
- indent_level = 1
+ indent_level = 1 # CrawlWorker
if 'Snapshot' in worker_type_name:
indent_level = 2
- elif 'ArchiveResult' in worker_type_name:
- indent_level = 3
log_worker_event(
worker_type=worker_type_name,
@@ -195,121 +144,157 @@ class Worker:
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
)
- def should_exit(self) -> bool:
- """Check if worker should exit due to idle timeout."""
- if self.daemon:
- return False
+ def _terminate_background_hooks(
+ self,
+ background_processes: dict[str, 'Process'],
+ worker_type: str,
+ indent_level: int,
+ ) -> None:
+ """
+ Terminate background hooks in 3 phases (shared logic for Crawl/Snapshot workers).
- if self.IDLE_TIMEOUT == 0:
- return False
+ Phase 1: Send SIGTERM to all bg hooks + children in parallel (polite request to wrap up)
+ Phase 2: Wait for each hook's remaining timeout before SIGKILL
+ Phase 3: SIGKILL any stragglers that exceeded their timeout
- return self.idle_count >= self.IDLE_TIMEOUT
+ Args:
+ background_processes: Dict mapping hook name -> Process instance
+ worker_type: Worker type name for logging (e.g., 'CrawlWorker', 'SnapshotWorker')
+ indent_level: Logging indent level (1 for Crawl, 2 for Snapshot)
+ """
+ import signal
+ import time
- def runloop(self) -> None:
- """Main worker loop - polls queue, processes items."""
- self.on_startup()
+ if not background_processes:
+ return
- # Determine worker type for logging
- worker_type_name = self.__class__.__name__
- indent_level = 1
+ now = time.time()
- if 'Snapshot' in worker_type_name:
- indent_level = 2
- elif 'ArchiveResult' in worker_type_name:
- indent_level = 3
+ # Phase 1: Send SIGTERM to ALL background processes + children in parallel
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'Sending SIGTERM to {len(background_processes)} background hooks (+ children)',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
- try:
- while True:
- # Try to claim and process an item
- obj = self.claim_next()
-
- if obj is not None:
- self.idle_count = 0
-
- # Build metadata for task start
- start_metadata = {}
- url = None
- if hasattr(obj, 'url'):
- # SnapshotWorker
- url = str(obj.url) if obj.url else None
- elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'):
- # ArchiveResultWorker
- url = str(obj.snapshot.url) if obj.snapshot.url else None
- elif hasattr(obj, 'get_urls_list'):
- # CrawlWorker
- urls = obj.get_urls_list()
- url = urls[0] if urls else None
-
- plugin = None
- if hasattr(obj, 'plugin'):
- # ArchiveResultWorker, Crawl
- plugin = obj.plugin
+ # Build deadline map first (before killing, to get accurate remaining time)
+ deadlines = {}
+ for hook_name, process in background_processes.items():
+ elapsed = now - process.started_at.timestamp()
+ remaining = max(0, process.timeout - elapsed)
+ deadline = now + remaining
+ deadlines[hook_name] = (process, deadline)
+ # Send SIGTERM to all process trees in parallel (non-blocking)
+ for hook_name, process in background_processes.items():
+ try:
+ # Get chrome children (renderer processes etc) before sending signal
+ children_pids = process.get_children_pids()
+ if children_pids:
+ # Chrome hook with children - kill tree
+ os.kill(process.pid, signal.SIGTERM)
+ for child_pid in children_pids:
+ try:
+ os.kill(child_pid, signal.SIGTERM)
+ except ProcessLookupError:
+ pass
log_worker_event(
- worker_type=worker_type_name,
- event='Processing',
+ worker_type=worker_type,
+ event=f'Sent SIGTERM to {hook_name} + {len(children_pids)} children',
indent_level=indent_level,
pid=self.pid,
- worker_id=str(self.worker_id),
- url=url,
- plugin=plugin,
- metadata=start_metadata if start_metadata else None,
- )
-
- start_time = time.time()
- success = self.process_item(obj)
- elapsed = time.time() - start_time
-
- # Build metadata for task completion
- complete_metadata = {
- 'duration': elapsed,
- 'status': 'success' if success else 'failed',
- }
-
- log_worker_event(
- worker_type=worker_type_name,
- event='Completed' if success else 'Failed',
- indent_level=indent_level,
- pid=self.pid,
- worker_id=str(self.worker_id),
- url=url,
- plugin=plugin,
- metadata=complete_metadata,
)
else:
- # No work available - idle logging suppressed
- self.idle_count += 1
+ # No children - normal kill
+ os.kill(process.pid, signal.SIGTERM)
+ except ProcessLookupError:
+ pass # Already dead
+ except Exception as e:
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'Failed to SIGTERM {hook_name}: {e}',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
- # Check if we should exit
- if self.should_exit():
- # Exit logging suppressed - shutdown will be logged by on_shutdown()
- break
+ # Phase 2: Wait for all processes in parallel, respecting individual timeouts
+ for hook_name, (process, deadline) in deadlines.items():
+ remaining = deadline - now
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'Waiting up to {remaining:.1f}s for {hook_name}',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
- time.sleep(self.POLL_INTERVAL)
+ # Poll all processes in parallel using Process.poll()
+ still_running = set(deadlines.keys())
- except KeyboardInterrupt:
- pass
- except BaseException as e:
- self.on_shutdown(error=e)
- raise
- else:
- self.on_shutdown()
+ while still_running:
+ time.sleep(0.1)
+ now = time.time()
+
+ for hook_name in list(still_running):
+ process, deadline = deadlines[hook_name]
+
+ # Check if process exited using Process.poll()
+ exit_code = process.poll()
+ if exit_code is not None:
+ # Process exited
+ still_running.remove(hook_name)
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'✓ {hook_name} exited with code {exit_code}',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
+ continue
+
+ # Check if deadline exceeded
+ if now >= deadline:
+ # Timeout exceeded - SIGKILL process tree
+ try:
+ # Get children before killing (chrome may have spawned more)
+ children_pids = process.get_children_pids()
+ if children_pids:
+ # Kill children first
+ for child_pid in children_pids:
+ try:
+ os.kill(child_pid, signal.SIGKILL)
+ except ProcessLookupError:
+ pass
+ # Then kill parent
+ process.kill(signal_num=signal.SIGKILL)
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
+ except Exception as e:
+ log_worker_event(
+ worker_type=worker_type,
+ event=f'Failed to SIGKILL {hook_name}: {e}',
+ indent_level=indent_level,
+ pid=self.pid,
+ )
+ still_running.remove(hook_name)
@classmethod
- def start(cls, worker_id: int | None = None, daemon: bool = False, **kwargs: Any) -> int:
+ def start(cls, **kwargs: Any) -> int:
"""
Fork a new worker as a subprocess.
Returns the PID of the new process.
"""
from archivebox.machine.models import Process
- if worker_id is None:
- worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
+ worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
# Use module-level function for pickling compatibility
proc = MPProcess(
target=_run_worker,
- args=(cls.name, worker_id, daemon),
+ args=(cls.name, worker_id),
kwargs=kwargs,
name=f'{cls.name}_worker_{worker_id}',
)
@@ -356,120 +341,397 @@ class Worker:
class CrawlWorker(Worker):
- """Worker for processing Crawl objects."""
+ """
+ Worker for processing Crawl objects.
+
+ Responsibilities:
+ 1. Run on_Crawl__* hooks (e.g., chrome launcher)
+ 2. Create Snapshots from URLs
+ 3. Spawn SnapshotWorkers (up to MAX_SNAPSHOT_WORKERS)
+ 4. Monitor snapshots and seal crawl when all done
+ """
name: ClassVar[str] = 'crawl'
MAX_TICK_TIME: ClassVar[int] = 60
+ MAX_SNAPSHOT_WORKERS: ClassVar[int] = 8 # Per crawl limit
+
+ def __init__(self, crawl_id: str, **kwargs: Any):
+ super().__init__(**kwargs)
+ self.crawl_id = crawl_id
+ self.crawl = None
def get_model(self):
from archivebox.crawls.models import Crawl
return Crawl
- def get_queue(self) -> QuerySet:
- """Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
- qs = super().get_queue()
- if self.crawl_id:
- qs = qs.filter(id=self.crawl_id)
- return qs
+ def on_startup(self) -> None:
+ """Load crawl."""
+ super().on_startup()
+
+ from archivebox.crawls.models import Crawl
+ self.crawl = Crawl.objects.get(id=self.crawl_id)
+
+ def runloop(self) -> None:
+ """Run crawl state machine, spawn SnapshotWorkers."""
+ import sys
+ self.on_startup()
+
+ try:
+ print(f'[cyan]🔄 CrawlWorker.runloop: Starting tick() for crawl {self.crawl_id}[/cyan]', file=sys.stderr)
+ # Advance state machine: QUEUED → STARTED (triggers run() via @started.enter)
+ self.crawl.sm.tick()
+ self.crawl.refresh_from_db()
+ print(f'[cyan]🔄 tick() complete, crawl status={self.crawl.status}[/cyan]', file=sys.stderr)
+
+ # Now spawn SnapshotWorkers and monitor progress
+ while True:
+ # Check if crawl is done
+ if self._is_crawl_finished():
+ print(f'[cyan]🔄 Crawl finished, sealing...[/cyan]', file=sys.stderr)
+ self.crawl.sm.seal()
+ break
+
+ # Spawn workers for queued snapshots
+ self._spawn_snapshot_workers()
+
+ time.sleep(2) # Check every 2s
+
+ finally:
+ self.on_shutdown()
+
+ def _spawn_snapshot_workers(self) -> None:
+ """Spawn SnapshotWorkers for queued snapshots (up to limit)."""
+ from archivebox.core.models import Snapshot
+ from archivebox.machine.models import Process
+
+ # Count running SnapshotWorkers for this crawl
+ running_count = Process.objects.filter(
+ process_type=Process.TypeChoices.WORKER,
+ worker_type='snapshot',
+ parent_id=self.db_process.id, # Children of this CrawlWorker
+ status__in=['running', 'started'],
+ ).count()
+
+ if running_count >= self.MAX_SNAPSHOT_WORKERS:
+ return # At limit
+
+ # Get queued snapshots for this crawl (SnapshotWorker will mark as STARTED in on_startup)
+ queued_snapshots = Snapshot.objects.filter(
+ crawl_id=self.crawl_id,
+ status=Snapshot.StatusChoices.QUEUED,
+ ).order_by('created_at')[:self.MAX_SNAPSHOT_WORKERS - running_count]
+
+ import sys
+ print(f'[yellow]🔧 _spawn_snapshot_workers: running={running_count}/{self.MAX_SNAPSHOT_WORKERS}, queued={queued_snapshots.count()}[/yellow]', file=sys.stderr)
+
+ # Spawn workers
+ for snapshot in queued_snapshots:
+ print(f'[yellow]🔧 Spawning worker for {snapshot.url} (status={snapshot.status})[/yellow]', file=sys.stderr)
+ SnapshotWorker.start(snapshot_id=str(snapshot.id))
+ log_worker_event(
+ worker_type='CrawlWorker',
+ event=f'Spawned SnapshotWorker for {snapshot.url}',
+ indent_level=1,
+ pid=self.pid,
+ )
+
+ def _is_crawl_finished(self) -> bool:
+ """Check if all snapshots are sealed."""
+ from archivebox.core.models import Snapshot
+
+ pending = Snapshot.objects.filter(
+ crawl_id=self.crawl_id,
+ status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
+ ).count()
+
+ return pending == 0
+
+ def on_shutdown(self, error: BaseException | None = None) -> None:
+ """
+ Terminate all background Crawl hooks when crawl finishes.
+
+ Background hooks (e.g., chrome launcher) should only be killed when:
+ - All snapshots are done (crawl is sealed)
+ - Worker is shutting down
+ """
+ from archivebox.machine.models import Process
+
+ # Query for all running hook processes that are children of this CrawlWorker
+ background_hooks = Process.objects.filter(
+ parent_id=self.db_process.id,
+ process_type=Process.TypeChoices.HOOK,
+ status=Process.StatusChoices.RUNNING,
+ ).select_related('machine')
+
+ # Build dict for shared termination logic
+ background_processes = {
+ hook.cmd[0] if hook.cmd else f'hook-{hook.pid}': hook
+ for hook in background_hooks
+ }
+
+ # Use shared termination logic from Worker base class
+ self._terminate_background_hooks(
+ background_processes=background_processes,
+ worker_type='CrawlWorker',
+ indent_level=1,
+ )
+
+ super().on_shutdown(error)
class SnapshotWorker(Worker):
- """Worker for processing Snapshot objects."""
+ """
+ Worker that owns sequential hook execution for ONE snapshot.
+
+ Unlike other workers, SnapshotWorker doesn't poll a queue - it's given
+ a specific snapshot_id and runs all hooks for that snapshot sequentially.
+
+ Execution flow:
+ 1. Mark snapshot as STARTED
+ 2. Discover hooks for snapshot
+ 3. For each hook (sorted by name):
+ a. Fork hook Process
+ b. If foreground: wait for completion
+ c. If background: track but continue to next hook
+ d. Update ArchiveResult status
+ e. Advance current_step when all step's hooks complete
+ 4. When all hooks done: seal snapshot
+ 5. On shutdown: SIGTERM all background hooks
+ """
name: ClassVar[str] = 'snapshot'
- MAX_TICK_TIME: ClassVar[int] = 60
+
+ def __init__(self, snapshot_id: str, **kwargs: Any):
+ super().__init__(**kwargs)
+ self.snapshot_id = snapshot_id
+ self.snapshot = None
+ self.background_processes: dict[str, Any] = {} # hook_name -> Process
def get_model(self):
+ """Not used - SnapshotWorker doesn't poll queues."""
from archivebox.core.models import Snapshot
return Snapshot
- def get_queue(self) -> QuerySet:
- """Get queue of Snapshots ready for processing, optionally filtered by crawl_id."""
- qs = super().get_queue()
- if self.crawl_id:
- qs = qs.filter(crawl_id=self.crawl_id)
- return qs
+ def on_startup(self) -> None:
+ """Load snapshot and mark as STARTED."""
+ super().on_startup()
+ from archivebox.core.models import Snapshot
+ self.snapshot = Snapshot.objects.get(id=self.snapshot_id)
-class ArchiveResultWorker(Worker):
- """Worker for processing ArchiveResult objects."""
+ # Mark snapshot as STARTED
+ self.snapshot.status = Snapshot.StatusChoices.STARTED
+ self.snapshot.retry_at = None # No more polling needed
+ self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
- name: ClassVar[str] = 'archiveresult'
- MAX_TICK_TIME: ClassVar[int] = 120
-
- def __init__(self, plugin: str | None = None, **kwargs: Any):
- super().__init__(**kwargs)
- self.plugin = plugin
-
- def get_model(self):
+ def runloop(self) -> None:
+ """Execute all hooks sequentially."""
+ from archivebox.hooks import discover_hooks, is_background_hook, extract_step
from archivebox.core.models import ArchiveResult
- return ArchiveResult
- def get_queue(self) -> QuerySet:
- """
- Get queue of ArchiveResults ready for processing.
+ self.on_startup()
- Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
- This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
- """
- from archivebox.core.models import ArchiveResult
- from archivebox.hooks import extract_step
-
- qs = super().get_queue()
-
- if self.crawl_id:
- qs = qs.filter(snapshot__crawl_id=self.crawl_id)
-
- if self.plugin:
- qs = qs.filter(plugin=self.plugin)
-
- # Step-based filtering: only process ARs whose step <= snapshot.current_step
- # Since step is derived from hook_name, we filter in Python after initial query
- # This is efficient because the base query already filters by retry_at and status
-
- # Get candidate ARs
- candidates = list(qs[:50]) # Limit to avoid loading too many
- ready_pks = []
-
- for ar in candidates:
- if not ar.hook_name:
- # Legacy ARs without hook_name - process them
- ready_pks.append(ar.pk)
- continue
-
- ar_step = extract_step(ar.hook_name)
- snapshot_step = ar.snapshot.current_step
-
- if ar_step <= snapshot_step:
- ready_pks.append(ar.pk)
-
- # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
- return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
-
- def process_item(self, obj) -> bool:
- """Process an ArchiveResult by running its plugin."""
try:
- obj.sm.tick()
- return True
+ # Discover all hooks for this snapshot
+ hooks = discover_hooks('Snapshot', config=self.snapshot.config)
+ hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
+
+ # Execute each hook sequentially
+ for hook_path in hooks:
+ hook_name = hook_path.name
+ plugin = self._extract_plugin_name(hook_name)
+ hook_step = extract_step(hook_name)
+ is_background = is_background_hook(hook_name)
+
+ # Create ArchiveResult for THIS HOOK (not per plugin)
+ # One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
+ # Unique key = (snapshot, plugin, hook_name) for idempotency
+ ar, created = ArchiveResult.objects.get_or_create(
+ snapshot=self.snapshot,
+ plugin=plugin,
+ hook_name=hook_name,
+ defaults={
+ 'status': ArchiveResult.StatusChoices.STARTED,
+ 'start_ts': timezone.now(),
+ }
+ )
+
+ if not created:
+ # Update existing AR to STARTED
+ ar.status = ArchiveResult.StatusChoices.STARTED
+ ar.start_ts = timezone.now()
+ ar.save(update_fields=['status', 'start_ts', 'modified_at'])
+
+ # Fork and run the hook
+ process = self._run_hook(hook_path, ar)
+
+ if is_background:
+ # Track but don't wait
+ self.background_processes[hook_name] = process
+ log_worker_event(
+ worker_type='SnapshotWorker',
+ event=f'Started background hook: {hook_name} (timeout={process.timeout}s)',
+ indent_level=2,
+ pid=self.pid,
+ )
+ else:
+ # Wait for foreground hook to complete
+ self._wait_for_hook(process, ar)
+ log_worker_event(
+ worker_type='SnapshotWorker',
+ event=f'Completed hook: {hook_name}',
+ indent_level=2,
+ pid=self.pid,
+ )
+
+ # Check if we can advance to next step
+ self._try_advance_step()
+
+ # All hooks launched (or completed) - cleanup and seal
+ self._cleanup_empty_archiveresults()
+ self.snapshot.status = Snapshot.StatusChoices.SEALED
+ self.snapshot.save(update_fields=['status', 'modified_at'])
+
except Exception as e:
- # Error will be logged in runloop's completion event
- traceback.print_exc()
- return False
+ # Mark snapshot as failed
+ self.snapshot.status = Snapshot.StatusChoices.SEALED # Still seal on error
+ self.snapshot.save(update_fields=['status', 'modified_at'])
+ raise
+ finally:
+ self.on_shutdown()
+
+ def _run_hook(self, hook_path: Path, ar: Any) -> Any:
+ """Fork and run a hook using Process model, return Process."""
+ from archivebox.hooks import run_hook
+
+ # Create output directory
+ output_dir = ar.create_output_dir()
+
+ # Run hook using Process.launch() - returns Process model directly
+ # Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy
+ process = run_hook(
+ script=hook_path,
+ output_dir=output_dir,
+ config=self.snapshot.config,
+ timeout=120,
+ parent=self.db_process,
+ url=str(self.snapshot.url),
+ snapshot_id=str(self.snapshot.id),
+ )
+
+ # Link ArchiveResult to Process for tracking
+ ar.process = process
+ ar.save(update_fields=['process_id', 'modified_at'])
+
+ return process
+
+ def _wait_for_hook(self, process: Any, ar: Any) -> None:
+ """Wait for hook using Process.wait(), update AR status."""
+ # Use Process.wait() helper instead of manual polling
+ try:
+ exit_code = process.wait(timeout=process.timeout)
+ except TimeoutError:
+ # Hook exceeded timeout - kill it
+ process.kill(signal_num=9)
+ exit_code = -1
+
+ # Update ArchiveResult from hook output
+ ar.update_from_output()
+ ar.end_ts = timezone.now()
+
+ # Determine final status from hook exit code
+ if exit_code == 0:
+ ar.status = ar.StatusChoices.SUCCEEDED
+ else:
+ ar.status = ar.StatusChoices.FAILED
+
+ ar.save(update_fields=['status', 'end_ts', 'modified_at'])
+
+ def _try_advance_step(self) -> None:
+ """Advance current_step if all foreground hooks in current step are done."""
+ from django.db.models import Q
+ from archivebox.core.models import ArchiveResult
+
+ current_step = self.snapshot.current_step
+
+ # Single query: foreground hooks in current step that aren't finished
+ # Foreground hooks: hook_name doesn't contain '.bg.'
+ pending_foreground = self.snapshot.archiveresult_set.filter(
+ Q(hook_name__contains=f'__{current_step}_') & # Current step
+ ~Q(hook_name__contains='.bg.') & # Not background
+ ~Q(status__in=ArchiveResult.FINAL_STATES) # Not finished
+ ).exists()
+
+ if pending_foreground:
+ return # Still waiting for hooks
+
+ # All foreground hooks done - advance!
+ self.snapshot.current_step += 1
+ self.snapshot.save(update_fields=['current_step', 'modified_at'])
+
+ log_worker_event(
+ worker_type='SnapshotWorker',
+ event=f'Advanced to step {self.snapshot.current_step}',
+ indent_level=2,
+ pid=self.pid,
+ )
+
+ def _cleanup_empty_archiveresults(self) -> None:
+ """Delete ArchiveResults that produced no output files."""
+ empty_ars = self.snapshot.archiveresult_set.filter(
+ output_files={} # No output files
+ ).filter(
+ status__in=self.snapshot.archiveresult_set.model.FINAL_STATES # Only delete finished ones
+ )
+
+ deleted_count = empty_ars.count()
+ if deleted_count > 0:
+ empty_ars.delete()
+ log_worker_event(
+ worker_type='SnapshotWorker',
+ event=f'Deleted {deleted_count} empty ArchiveResults',
+ indent_level=2,
+ pid=self.pid,
+ )
+
+ def on_shutdown(self, error: BaseException | None = None) -> None:
+ """
+ Terminate all background Snapshot hooks when snapshot finishes.
+
+ Background hooks should only be killed when:
+ - All foreground hooks are done (snapshot is sealed)
+ - Worker is shutting down
+ """
+ # Use shared termination logic from Worker base class
+ self._terminate_background_hooks(
+ background_processes=self.background_processes,
+ worker_type='SnapshotWorker',
+ indent_level=2,
+ )
+
+ super().on_shutdown(error)
+
+ @staticmethod
+ def _extract_plugin_name(hook_name: str) -> str:
+ """Extract plugin name from hook filename."""
+ # on_Snapshot__50_wget.py -> wget
+ name = hook_name.split('__')[-1] # Get part after last __
+ name = name.replace('.py', '').replace('.js', '').replace('.sh', '')
+ name = name.replace('.bg', '') # Remove .bg suffix
+ return name
@classmethod
- def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
- """Fork a new worker as subprocess with optional plugin filter."""
+ def start(cls, snapshot_id: str, **kwargs: Any) -> int:
+ """Fork a SnapshotWorker for a specific snapshot."""
from archivebox.machine.models import Process
- if worker_id is None:
- worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
+ worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
- # Use module-level function for pickling compatibility
proc = MPProcess(
- target=_run_worker,
- args=(cls.name, worker_id, daemon),
- kwargs={'plugin': plugin, **kwargs},
- name=f'{cls.name}_worker_{worker_id}',
+ target=_run_snapshot_worker, # New module-level function
+ args=(snapshot_id, worker_id),
+ kwargs=kwargs,
+ name=f'snapshot_worker_{snapshot_id[:8]}',
)
proc.start()
@@ -481,7 +743,6 @@ class ArchiveResultWorker(Worker):
WORKER_TYPES.update({
'crawl': CrawlWorker,
'snapshot': SnapshotWorker,
- 'archiveresult': ArchiveResultWorker,
})
diff --git a/old/TODO_Process_cleanup_unification.md b/old/TODO_Process_cleanup_unification.md
new file mode 100644
index 00000000..ea12a0a0
--- /dev/null
+++ b/old/TODO_Process_cleanup_unification.md
@@ -0,0 +1,333 @@
+# Process Model Integration Plan
+
+## Current Architecture
+
+### Hook Execution Flow
+```
+Orchestrator
+ ├─> CrawlWorker
+ │ └─> Crawl.run() [state machine @started.enter]
+ │ └─> run_hook() for on_Crawl__* hooks
+ │ └─> subprocess.Popen (NOT using Process model)
+ │
+ └─> SnapshotWorker
+ └─> Snapshot.run() [planned - doesn't exist yet]
+ └─> ArchiveResult.run() [state machine @started.enter]
+ └─> run_hook() for on_Snapshot__* hooks
+ └─> subprocess.Popen (NOT using Process model)
+```
+
+### Problem
+1. **No Process tracking**: `run_hook()` uses `subprocess.Popen` directly, never creates Process records
+2. **Orphaned Process model**: Process model has `.launch()`, `.wait()`, `.terminate()` methods that are NEVER used
+3. **Manual process management**: SnapshotWorker manually uses psutil for waiting/killing
+4. **Duplicate logic**: Process model and run_hook() both do subprocess management independently
+
+## Unified Architecture
+
+### Goal
+Make Process model the **single source of truth** for all subprocess operations:
+- Hook execution
+- PID tracking
+- stdout/stderr capture
+- Process lifecycle (launch, wait, terminate)
+
+### Design
+
+```python
+# hooks.py - Thin wrapper
+def run_hook(...) -> Process:
+ """
+ Run a hook using Process model (THIN WRAPPER).
+
+ Returns Process model instance for tracking and control.
+ """
+ from archivebox.machine.models import Process
+
+ # Build command
+ cmd = build_hook_cmd(script, kwargs)
+
+ # Use Process.launch() - handles everything
+ process = Process.objects.create(
+ machine=Machine.current(),
+ process_type=Process.TypeChoices.HOOK,
+ pwd=str(output_dir),
+ cmd=cmd,
+ env=build_hook_env(config),
+ timeout=timeout,
+ )
+
+ # Launch subprocess
+ process.launch(background=is_background_hook(script.name))
+
+ return process # Return Process, not dict
+
+
+# worker.py - Use Process methods
+class SnapshotWorker:
+ def _run_hook(self, hook_path, ar) -> Process:
+ """Fork hook using Process model."""
+ process = run_hook(
+ hook_path,
+ ar.create_output_dir(),
+ self.snapshot.config,
+ url=self.snapshot.url,
+ snapshot_id=str(self.snapshot.id),
+ )
+
+ # Link ArchiveResult to Process
+ ar.process = process
+ ar.save()
+
+ return process
+
+ def _wait_for_hook(self, process, ar):
+ """Wait using Process.wait() method."""
+ exit_code = process.wait(timeout=None)
+
+ # Update AR from hook output
+ ar.update_from_output()
+ ar.status = ar.StatusChoices.SUCCEEDED if exit_code == 0 else ar.StatusChoices.FAILED
+ ar.save()
+
+ def on_shutdown(self):
+ """
+ Terminate all background hooks in parallel with per-plugin timeouts.
+
+ Phase 1: Send SIGTERM to all in parallel (polite request to wrap up)
+ Phase 2: Wait for all in parallel, respecting individual plugin timeouts
+ Phase 3: SIGKILL any that exceed their timeout
+
+ Each plugin has its own timeout (SCREENSHOT_TIMEOUT=60, YTDLP_TIMEOUT=300, etc.)
+ Some hooks (consolelog, responses) exit immediately on SIGTERM.
+ Others (ytdlp, wget) need their full timeout to finish actual work.
+ """
+ # Send SIGTERM to all processes in parallel
+ for hook_name, process in self.background_processes.items():
+ os.kill(process.pid, signal.SIGTERM)
+
+ # Build per-process deadlines based on plugin-specific timeouts
+ deadlines = {
+ name: (proc, time.time() + max(0, proc.timeout - (time.time() - proc.started_at.timestamp())))
+ for name, proc in self.background_processes.items()
+ }
+
+ # Poll all processes in parallel - no head-of-line blocking
+ still_running = set(deadlines.keys())
+ while still_running:
+ time.sleep(0.1)
+ for name in list(still_running):
+ proc, deadline = deadlines[name]
+ if not proc.is_running():
+ still_running.remove(name)
+ elif time.time() >= deadline:
+ os.kill(proc.pid, signal.SIGKILL) # Timeout exceeded
+ still_running.remove(name)
+
+
+# models.py - Process becomes active
+class Process:
+ def launch(self, background=False):
+ """Spawn subprocess and track it."""
+ with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err:
+ proc = subprocess.Popen(
+ self.cmd,
+ cwd=self.pwd,
+ stdout=out,
+ stderr=err,
+ env=self._build_env(),
+ )
+
+ self.pid = proc.pid
+ self.started_at = timezone.now()
+ self.status = self.StatusChoices.RUNNING
+ self.save()
+
+ if not background:
+ # Foreground - wait inline
+ proc.wait()
+ self.exit_code = proc.returncode
+ self.ended_at = timezone.now()
+ self.status = self.StatusChoices.EXITED
+ self.save()
+
+ return self
+
+ def wait(self, timeout=None):
+ """Wait for process to exit, polling DB."""
+ while True:
+ self.refresh_from_db()
+ if self.status == self.StatusChoices.EXITED:
+ return self.exit_code
+
+ # Check via psutil if Process died without updating DB
+ if not self.is_running():
+ self._reap() # Update status from OS
+ return self.exit_code
+
+ time.sleep(0.1)
+
+ def terminate(self, sig=signal.SIGTERM):
+ """Gracefully terminate: SIGTERM → wait → SIGKILL."""
+ if not self.is_running():
+ return True
+
+ os.kill(self.pid, sig)
+
+ # Wait for graceful shutdown
+ for _ in range(50): # 5 seconds
+ if not self.is_running():
+ self._reap()
+ return True
+ time.sleep(0.1)
+
+ # Escalate to SIGKILL
+ os.kill(self.pid, signal.SIGKILL)
+ self._reap()
+ return True
+```
+
+## Migration Steps
+
+### Step 1: Update Process.launch() (DONE - already exists)
+Process model already has `.launch()`, `.wait()`, `.terminate()` methods implemented in machine/models.py:1295-1593
+
+### Step 2: Refactor run_hook() to use Process.launch()
+**File**: `archivebox/hooks.py`
+
+Change signature from:
+```python
+def run_hook(...) -> HookResult: # Returns dict
+```
+
+To:
+```python
+def run_hook(...) -> Process: # Returns Process model
+```
+
+**Implementation**:
+```python
+def run_hook(script, output_dir, config, timeout=None, **kwargs) -> Process:
+ from archivebox.machine.models import Process, Machine
+
+ # Build command
+ cmd = build_hook_cmd(script, kwargs)
+ env = build_hook_env(config)
+ is_bg = is_background_hook(script.name)
+
+ # Create Process record
+ process = Process.objects.create(
+ machine=Machine.current(),
+ process_type=Process.TypeChoices.HOOK,
+ pwd=str(output_dir),
+ cmd=cmd,
+ env=env,
+ timeout=timeout or 120,
+ )
+
+ # Launch subprocess
+ process.launch(background=is_bg)
+
+ return process
+```
+
+### Step 3: Update SnapshotWorker to use Process methods
+**File**: `archivebox/workers/worker.py`
+
+Replace manual psutil code with Process model methods (shown above in Design section).
+
+### Step 4: Update ArchiveResult.run() to use new run_hook()
+**File**: `archivebox/core/models.py:2559`
+
+Change from:
+```python
+result = run_hook(...) # Returns HookResult dict
+if result is None:
+ is_bg_hook = True
+```
+
+To:
+```python
+process = run_hook(...) # Returns Process
+self.process = process
+self.save()
+
+if process.status == Process.StatusChoices.RUNNING:
+ # Background hook - still running
+ return
+else:
+ # Foreground hook - completed
+ self.update_from_output()
+```
+
+### Step 5: Update Crawl.run() similarly
+**File**: `archivebox/crawls/models.py:374`
+
+Same pattern as ArchiveResult.run()
+
+## Benefits
+
+### 1. Single Source of Truth
+- Process model owns ALL subprocess operations
+- No duplicate logic between run_hook(), Process, and workers
+- Consistent PID tracking, stdout/stderr handling
+
+### 2. Proper Hierarchy
+```
+Process.parent_id creates tree:
+Orchestrator (PID 1000)
+ └─> CrawlWorker (PID 1001, parent=1000)
+ └─> on_Crawl__01_chrome.js (PID 1010, parent=1001)
+ └─> SnapshotWorker (PID 1020, parent=1000)
+ └─> on_Snapshot__50_wget.py (PID 1021, parent=1020)
+ └─> on_Snapshot__63_ytdlp.bg.py (PID 1022, parent=1020)
+```
+
+### 3. Better Observability
+- Query all hook processes: `snapshot.process_set.all()`
+- Count running: `Process.objects.filter(status='running').count()`
+- Track resource usage via Process.get_memory_info()
+
+### 4. Cleaner Code
+- SnapshotWorker._wait_for_hook: 25 lines → 8 lines
+- SnapshotWorker.on_shutdown: 12 lines → 7 lines
+- run_hook(): ~200 lines → ~50 lines
+- Total: ~100 LoC saved
+
+## Risks & Mitigation
+
+### Risk 1: Breaking existing run_hook() callers
+**Mitigation**: Two-phase rollout
+1. Phase 1: Add run_hook_v2() that returns Process
+2. Phase 2: Migrate callers to run_hook_v2()
+3. Phase 3: Rename run_hook → run_hook_legacy, run_hook_v2 → run_hook
+
+### Risk 2: Background hook tracking changes
+**Mitigation**:
+- Process.launch(background=True) handles async launches
+- Process.wait() already polls for completion
+- Behavior identical to current subprocess.Popen
+
+### Risk 3: Performance overhead (extra DB writes)
+**Mitigation**:
+- Process records already being created (just not used)
+- Batch updates where possible
+- Monitor via metrics
+
+## Timeline
+
+### Immediate (This PR)
+- [x] State machine fixes (completed)
+- [x] Step advancement optimization (completed)
+- [x] Document unified architecture (this file)
+
+### Next PR (Process Integration)
+1. Add run_hook_v2() returning Process
+2. Update SnapshotWorker to use Process methods
+3. Migrate ArchiveResult.run() and Crawl.run()
+4. Deprecate old run_hook()
+
+### Future
+- Remove run_hook_legacy after migration complete
+- Add Process.get_tree() for hierarchy visualization
+- Add ProcessMachine state machine for lifecycle management
diff --git a/TODO_fix_migration_path.md b/old/TODO_fix_migration_path.md
similarity index 100%
rename from TODO_fix_migration_path.md
rename to old/TODO_fix_migration_path.md