From dd775110260c3847a237255ee716f2c85d94b6b2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 2 Jan 2026 04:20:34 -0800 Subject: [PATCH] unified Process source of truth and better screenshot tests --- ArchiveBox.conf | 3 - archivebox/api/v1_workers.py | 63 +- archivebox/cli/archivebox_add.py | 14 +- archivebox/cli/archivebox_init.py | 4 +- archivebox/cli/archivebox_pluginmap.py | 119 +-- archivebox/cli/archivebox_update.py | 82 +- .../0027_copy_archiveresult_to_process.py | 13 +- .../0028_alter_snapshot_fs_version.py | 18 + .../0029_migrate_archiveresult_to_uuid_pk.py | 181 +++++ .../migrations/0030_alter_archiveresult_id.py | 19 + archivebox/core/models.py | 290 +++---- archivebox/crawls/admin.py | 29 - archivebox/crawls/models.py | 215 ++--- archivebox/hooks.py | 285 ++++--- .../migrations/0009_alter_binary_status.py | 18 + archivebox/machine/models.py | 61 +- archivebox/misc/logging_util.py | 10 +- archivebox/misc/progress_layout.py | 345 ++++++++ .../accessibility/tests/test_accessibility.py | 85 +- archivebox/plugins/chrome/chrome_utils.js | 8 +- .../chrome/tests/chrome_test_helpers.py | 246 +++--- .../plugins/chrome/tests/test_chrome.py | 3 +- .../chrome/tests/test_chrome_test_helpers.py | 260 ++++++ .../consolelog/tests/test_consolelog.py | 7 +- .../forumdl/on_Crawl__13_forumdl_install.py | 80 ++ .../infiniscroll/tests/test_infiniscroll.py | 66 +- .../tests/test_istilldontcareaboutcookies.py | 3 +- .../mercury/on_Crawl__12_mercury_install.py | 17 +- .../modalcloser/tests/test_modalcloser.py | 244 +++--- .../papersdl/on_Crawl__14_papersdl_install.py | 80 ++ .../tests/test_parse_dom_outlinks.py | 7 +- .../plugins/redirects/tests/test_redirects.py | 7 +- .../plugins/responses/tests/test_responses.py | 7 +- .../screenshot/on_Snapshot__51_screenshot.js | 204 ++--- .../screenshot/tests/test_screenshot.py | 667 ++++------------ archivebox/plugins/seo/tests/test_seo.py | 7 +- .../singlefile/tests/test_singlefile.py | 28 +- archivebox/plugins/ssl/tests/test_ssl.py | 7 +- .../staticfile/tests/test_staticfile.py | 10 +- archivebox/tests/test_recursive_crawl.py | 6 +- archivebox/workers/orchestrator.py | 388 ++++++--- archivebox/workers/worker.py | 749 ++++++++++++------ old/TODO_Process_cleanup_unification.md | 333 ++++++++ .../TODO_fix_migration_path.md | 0 44 files changed, 3369 insertions(+), 1919 deletions(-) delete mode 100644 ArchiveBox.conf create mode 100644 archivebox/core/migrations/0028_alter_snapshot_fs_version.py create mode 100644 archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py create mode 100644 archivebox/core/migrations/0030_alter_archiveresult_id.py create mode 100644 archivebox/machine/migrations/0009_alter_binary_status.py create mode 100644 archivebox/misc/progress_layout.py create mode 100644 archivebox/plugins/chrome/tests/test_chrome_test_helpers.py create mode 100755 archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py create mode 100755 archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py create mode 100644 old/TODO_Process_cleanup_unification.md rename TODO_fix_migration_path.md => old/TODO_fix_migration_path.md (100%) diff --git a/ArchiveBox.conf b/ArchiveBox.conf deleted file mode 100644 index a7eaff41..00000000 --- a/ArchiveBox.conf +++ /dev/null @@ -1,3 +0,0 @@ -[SERVER_CONFIG] -SECRET_KEY = y6fw9wcaqls9sx_dze6ahky9ggpkpzoaw5g5v98_u3ro5j0_4f - diff --git a/archivebox/api/v1_workers.py b/archivebox/api/v1_workers.py index 95678ef5..9e138e16 100644 --- a/archivebox/api/v1_workers.py +++ b/archivebox/api/v1_workers.py @@ 
-35,12 +35,8 @@ class WorkerSchema(Schema): model: str max_tick_time: int max_concurrent_tasks: int - poll_interval: float - idle_timeout: int running_count: int running_workers: List[dict[str, Any]] - queue_count: int - queue: List[QueueItemSchema] @staticmethod def resolve_model(obj) -> str: @@ -55,38 +51,21 @@ class WorkerSchema(Schema): def resolve_max_concurrent_tasks(obj) -> int: return obj.MAX_CONCURRENT_TASKS - @staticmethod - def resolve_poll_interval(obj) -> float: - return obj.POLL_INTERVAL - - @staticmethod - def resolve_idle_timeout(obj) -> int: - return obj.IDLE_TIMEOUT - @staticmethod def resolve_running_count(obj) -> int: - return len(obj.get_running_workers()) + return obj.get_worker_count() @staticmethod def resolve_running_workers(obj) -> List[dict[str, Any]]: return obj.get_running_workers() - @staticmethod - def resolve_queue_count(obj) -> int: - return obj.get_queue().count() - - @staticmethod - def resolve_queue(obj) -> List[QueueItemSchema]: - return list(obj.get_queue()[:50]) # Limit to 50 items - class OrchestratorSchema(Schema): """Schema for the Orchestrator.""" is_running: bool poll_interval: float idle_timeout: int - max_workers_per_type: int - max_total_workers: int + max_crawl_workers: int total_worker_count: int workers: List[WorkerSchema] @@ -95,23 +74,20 @@ class OrchestratorSchema(Schema): def get_orchestrator(request): """Get the orchestrator status and all worker queues.""" from archivebox.workers.orchestrator import Orchestrator - from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker + from archivebox.workers.worker import CrawlWorker orchestrator = Orchestrator() # Create temporary worker instances to query their queues workers = [ CrawlWorker(worker_id=-1), - SnapshotWorker(worker_id=-1), - ArchiveResultWorker(worker_id=-1), ] return { 'is_running': orchestrator.is_running(), 'poll_interval': orchestrator.POLL_INTERVAL, 'idle_timeout': orchestrator.IDLE_TIMEOUT, - 'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE, - 'max_total_workers': orchestrator.MAX_TOTAL_WORKERS, + 'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS, 'total_worker_count': orchestrator.get_total_worker_count(), 'workers': workers, } @@ -120,41 +96,12 @@ def get_orchestrator(request): @router.get("/workers", response=List[WorkerSchema], url_name="get_workers") def get_workers(request): """List all worker types and their current status.""" - from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker + from archivebox.workers.worker import CrawlWorker # Create temporary instances to query their queues return [ CrawlWorker(worker_id=-1), - SnapshotWorker(worker_id=-1), - ArchiveResultWorker(worker_id=-1), ] -@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker") -def get_worker(request, worker_name: str): - """Get status and queue for a specific worker type.""" - from archivebox.workers.worker import WORKER_TYPES - - if worker_name not in WORKER_TYPES: - from ninja.errors import HttpError - raise HttpError(404, f"Unknown worker type: {worker_name}. 
Valid types: {list(WORKER_TYPES.keys())}") - - WorkerClass = WORKER_TYPES[worker_name] - return WorkerClass(worker_id=-1) - - -@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue") -def get_worker_queue(request, worker_name: str, limit: int = 100): - """Get the current queue for a specific worker type.""" - from archivebox.workers.worker import WORKER_TYPES - - if worker_name not in WORKER_TYPES: - from ninja.errors import HttpError - raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}") - - WorkerClass = WORKER_TYPES[worker_name] - worker = WorkerClass(worker_id=-1) - return list(worker.get_queue()[:limit]) - - # Progress endpoint moved to core.views.live_progress_view for simplicity diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 5043f3ed..25b0815b 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -96,10 +96,9 @@ def add(urls: str | list[str], first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else '' print(f' [dim]First URL: {first_url}[/dim]') - # 3. The CrawlMachine will create the root Snapshot when started - # If URLs are from a file: first URL = file:///path/to/sources/...txt - # Parser extractors will run on it and discover more URLs - # Those URLs become child Snapshots (depth=1) + # 3. The CrawlMachine will create Snapshots from all URLs when started + # Parser extractors run on snapshots and discover more URLs + # Discovered URLs become child Snapshots (depth+1) if index_only: # Just create the crawl but don't start processing @@ -119,10 +118,9 @@ def add(urls: str | list[str], # 5. Start the orchestrator to process the queue # The orchestrator will: - # - Process Crawl -> create root Snapshot - # - Process root Snapshot -> run parser extractors -> discover URLs - # - Create child Snapshots from discovered URLs - # - Process child Snapshots -> run extractors + # - Process Crawl -> create Snapshots from all URLs + # - Process Snapshots -> run extractors + # - Parser extractors discover new URLs -> create child Snapshots # - Repeat until max_depth reached if bg: diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index 5ef6c9ca..34b10faa 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) - + (CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True) + from archivebox.config.common import STORAGE_CONFIG STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) + (STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True) if install: from archivebox.cli.archivebox_install import install as install_method diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index b168a480..04a8cba6 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """ ├─────────────────────────────────────────────────────────────────────────────┤ │ │ │ ┌─────────────┐ │ -│ │ QUEUED │◄────────────────┐ │ -│ │ (initial) │ │ │ -│ └──────┬──────┘ │ │ -│ │ │ tick() unless can_start() │ 
-│ │ tick() when │ │ -│ │ can_start() │ │ -│ ▼ │ │ -│ ┌─────────────┐ │ │ -│ │ STARTED │─────────────────┘ │ -│ │ │◄────────────────┐ │ -│ │ enter: │ │ tick() unless is_finished() │ -│ │ result.run()│─────────────────┘ │ -│ │ (execute │ │ -│ │ hook via │ │ -│ │ run_hook())│ │ -│ └──────┬──────┘ │ -│ │ │ -│ │ tick() checks status set by hook output │ -│ ├────────────────┬────────────────┬────────────────┐ │ -│ ▼ ▼ ▼ ▼ │ -│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ -│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ -│ │ (final) │ │ (final) │ │ (final) │ │ │ │ -│ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ │ -│ │ │ -│ can_start()───┘ │ -│ loops back to STARTED │ +│ │ QUEUED │◄─────────────────┐ │ +│ │ (initial) │ │ │ +│ └──┬───────┬──┘ │ │ +│ │ │ │ tick() unless can_start() │ +│ │ │ exceeded_max_ │ │ +│ │ │ attempts │ │ +│ │ ▼ │ │ +│ │ ┌──────────┐ │ │ +│ │ │ SKIPPED │ │ │ +│ │ │ (final) │ │ │ +│ │ └──────────┘ │ │ +│ │ tick() when │ │ +│ │ can_start() │ │ +│ ▼ │ │ +│ ┌─────────────┐ │ │ +│ │ STARTED │──────────────────┘ │ +│ │ │◄─────────────────────────────────────────────────┐ │ +│ │ enter: │ │ │ │ +│ │ result.run()│ tick() unless │ │ │ +│ │ (execute │ is_finished() │ │ │ +│ │ hook via │──────────────────────┘ │ │ +│ │ run_hook())│ │ │ +│ └──────┬──────┘ │ │ +│ │ │ │ +│ │ tick() checks status set by hook output │ │ +│ ├─────────────┬─────────────┬─────────────┐ │ │ +│ ▼ ▼ ▼ ▼ │ │ +│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │ +│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │ +│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │ +│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │ +│ │ │ │ │ +│ exceeded_max_ │ │ can_start()│ │ +│ attempts │ │ loops back │ │ +│ ▼ │ └────────────┘ │ +│ ┌──────────┐ │ │ +│ │ SKIPPED │◄─┘ │ +│ │ (final) │ │ +│ └──────────┘ │ │ │ │ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │ └─────────────────────────────────────────────────────────────────────────────┘ @@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """ │ │ QUEUED │◄────────────────┐ │ │ │ (initial) │ │ │ │ └──────┬──────┘ │ │ -│ │ │ tick() unless can_start() │ +│ │ │ tick() unless can_install() │ +│ │ │ (stays queued if failed) │ │ │ tick() when │ │ -│ │ can_start() │ │ -│ ▼ │ │ -│ ┌─────────────┐ │ │ -│ │ STARTED │─────────────────┘ │ -│ │ │◄────────────────┐ │ -│ │ enter: │ │ │ -│ │ binary.run()│ │ tick() unless is_finished() │ -│ │ (discover │─────────────────┘ │ -│ │ Binary │ │ -│ │ hooks, │ │ -│ │ try each │ │ -│ │ provider) │ │ -│ └──────┬──────┘ │ +│ │ can_install() │ │ +│ │ │ │ +│ │ on_install() runs │ │ +│ │ during transition: │ │ +│ │ • binary.run() │ │ +│ │ (discover Binary │ │ +│ │ hooks, try each │ │ +│ │ provider until │ │ +│ │ one succeeds) │ │ +│ │ • Sets abspath, │ │ +│ │ version, sha256 │ │ +│ │ │ │ +│ │ If install fails: │ │ +│ │ raises exception──────┘ │ +│ │ (retry_at bumped) │ │ │ │ -│ │ tick() checks status set by hook output │ -│ ├────────────────────────────────┐ │ -│ ▼ ▼ │ -│ ┌─────────────┐ ┌─────────────┐ │ -│ │ SUCCEEDED │ │ FAILED │ │ -│ │ (final) │ │ (final) │ │ -│ │ │ │ │ │ -│ │ abspath, │ │ no provider │ │ -│ │ version set │ │ succeeded │ │ -│ └─────────────┘ └─────────────┘ │ +│ ▼ │ +│ ┌─────────────┐ │ +│ │ INSTALLED │ │ +│ │ (final) │ │ +│ │ │ │ +│ │ Binary is │ │ +│ │ ready to │ │ +│ │ use │ │ +│ └─────────────┘ │ │ │ -│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter) │ +│ Hooks triggered: on_Binary__* (provider hooks during transition) │ │ Providers tried in sequence 
until one succeeds: apt, brew, pip, npm, etc. │ +│ Installation is synchronous - no intermediate STARTED state │ └─────────────────────────────────────────────────────────────────────────────┘ """ diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 2fbd05c0..f780a289 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di if not archive_dir.exists(): return stats - print('[*] Scanning for old directories in archive/...') + print('[DEBUG Phase1] Scanning for old directories in archive/...') # Scan for real directories only (skip symlinks - they're already migrated) + all_entries = list(os.scandir(archive_dir)) + print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}') entries = [ (e.stat().st_mtime, e.path) - for e in os.scandir(archive_dir) + for e in all_entries if e.is_dir(follow_symlinks=False) # Skip symlinks ] entries.sort(reverse=True) # Newest first + print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}') print(f'[*] Found {len(entries)} old directories to drain') for mtime, entry_path in entries: @@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di print(f" [{stats['processed']}] Invalid: {entry_path.name}") continue + # Ensure snapshot has a valid crawl (migration 0024 may have failed) + from archivebox.crawls.models import Crawl + has_valid_crawl = False + if snapshot.crawl_id: + # Check if the crawl actually exists + has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists() + + if not has_valid_crawl: + # Create a new crawl (created_by will default to system user) + crawl = Crawl.objects.create(urls=snapshot.url) + # Use queryset update to avoid triggering save() hooks + from archivebox.core.models import Snapshot as SnapshotModel + SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl) + # Refresh the instance + snapshot.crawl = crawl + snapshot.crawl_id = crawl.id + print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}") + # Check if needs migration (0.8.x → 0.9.x) + print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}") if snapshot.fs_migration_needed: try: - # Manually trigger filesystem migration without full save() - # This avoids UNIQUE constraint issues while still migrating files - cleanup_info = None - if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'): - cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0() + # Calculate paths using actual directory (entry_path), not snapshot.timestamp + # because snapshot.timestamp might be truncated + old_dir = entry_path + new_dir = snapshot.get_storage_path_for_version('0.9.0') + print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}") + + # Manually migrate files + if not new_dir.exists() and old_dir.exists(): + new_dir.mkdir(parents=True, exist_ok=True) + import shutil + file_count = 0 + for old_file in old_dir.rglob('*'): + if old_file.is_file(): + rel_path = old_file.relative_to(old_dir) + new_file = new_dir / rel_path + if not new_file.exists(): + new_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(old_file, new_file) + file_count += 1 + print(f"[DEBUG Phase1] Copied {file_count} files") # Update only fs_version field using queryset update (bypasses validation) from archivebox.core.models import Snapshot as 
SnapshotModel @@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di # Commit the transaction transaction.commit() - # Manually call cleanup since we bypassed normal save() flow - if cleanup_info: - old_dir, new_dir = cleanup_info + # Cleanup: delete old dir and create symlink + if old_dir.exists() and old_dir != new_dir: snapshot._cleanup_old_migration_dir(old_dir, new_dir) stats['migrated'] += 1 @@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict: continue try: - # Reconcile index.json with DB - snapshot.reconcile_with_index_json() + print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}") + + # Check if snapshot has a directory on disk + from pathlib import Path + output_dir = Path(snapshot.output_dir) + has_directory = output_dir.exists() and output_dir.is_dir() + + # Only reconcile if directory exists (don't create empty directories for orphans) + if has_directory: + snapshot.reconcile_with_index_json() # Clean up invalid field values from old migrations if not isinstance(snapshot.current_step, int): snapshot.current_step = 0 + # If still needs migration, it's an orphan (no directory on disk) + # Mark it as migrated to prevent save() from triggering filesystem migration + if snapshot.fs_migration_needed: + if has_directory: + print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration") + else: + print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation") + # Use queryset update to set fs_version without triggering save() hooks + from archivebox.core.models import Snapshot as SnapshotModel + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0') + snapshot.fs_version = '0.9.0' + # Queue for archiving (state machine will handle it) snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.retry_at = timezone.now() snapshot.save() - stats['reconciled'] += 1 + stats['reconciled'] += 1 if has_directory else 0 stats['queued'] += 1 except Exception as e: # Skip snapshots that can't be processed (e.g., missing crawl) diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py index 5b0666c5..8ac9d889 100644 --- a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py +++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py @@ -4,6 +4,7 @@ from django.db import migrations, connection import json from pathlib import Path +from archivebox.uuid_compat import uuid7 def parse_cmd_field(cmd_raw): @@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw): def get_or_create_current_machine(cursor): """Get or create Machine.current() using raw SQL.""" - import uuid import socket from datetime import datetime @@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor): return row[0] # Create new machine - machine_id = str(uuid.uuid4()) + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + machine_id = uuid7().hex now = datetime.now().isoformat() # Check which columns exist (schema differs between 0.8.x and 0.9.x) @@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version): Returns: binary_id (str) """ - import uuid from datetime import datetime # If abspath is just a name without slashes, it's not a full path @@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, 
version): return row[0] # Create new binary - binary_id = str(uuid.uuid4()) + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + binary_id = uuid7().hex now = datetime.now().isoformat() # Check which columns exist (schema differs between 0.8.x and 0.9.x) @@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, Returns: process_id (str) """ - import uuid from datetime import datetime - process_id = str(uuid.uuid4()) + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + process_id = uuid7().hex now = datetime.now().isoformat() # Convert cmd array to JSON diff --git a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py new file mode 100644 index 00000000..eb86883d --- /dev/null +++ b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py @@ -0,0 +1,18 @@ +# Generated by Django 6.0 on 2026-01-02 08:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0027_copy_archiveresult_to_process'), + ] + + operations = [ + migrations.AlterField( + model_name='snapshot', + name='fs_version', + field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10), + ), + ] diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py new file mode 100644 index 00000000..36b9f14c --- /dev/null +++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py @@ -0,0 +1,181 @@ +# Generated by hand on 2026-01-02 +# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot) + +from django.db import migrations, models, connection +from uuid import UUID +from archivebox.uuid_compat import uuid7 + + +def migrate_archiveresult_id_to_uuid(apps, schema_editor): + """ + Migrate ArchiveResult from integer PK to UUID PK. + + Strategy: + 1. Add old_id field to store current integer IDs + 2. Generate UUIDs for any records missing them + 3. 
Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id) + """ + cursor = connection.cursor() + + # Check if table exists and has data + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") + if not cursor.fetchone(): + print('ArchiveResult table does not exist, skipping migration') + return + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + row_count = cursor.fetchone()[0] + + if row_count == 0: + print('No ArchiveResult records to migrate') + return + + print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...') + + # Step 0: Check if machine_process table exists, if not NULL out process_id values + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'") + machine_process_exists = cursor.fetchone() is not None + + if not machine_process_exists: + print('machine_process table does not exist yet, setting process_id to NULL') + cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL") + + # Step 1: Create new table with UUID as primary key + cursor.execute(""" + CREATE TABLE core_archiveresult_new ( + id TEXT PRIMARY KEY NOT NULL, + old_id INTEGER, + uuid TEXT UNIQUE, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + snapshot_id TEXT NOT NULL, + plugin VARCHAR(32) NOT NULL, + hook_name VARCHAR(255) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + start_ts DATETIME, + end_ts DATETIME, + + output_str TEXT NOT NULL DEFAULT '', + output_json TEXT, + output_files TEXT NOT NULL DEFAULT '{}', + output_size BIGINT NOT NULL DEFAULT 0, + output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', + + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + process_id TEXT, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, + FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL + ); + """) + + # Step 2: Generate UUIDs for records that don't have them + cursor.execute("SELECT id, uuid FROM core_archiveresult") + records = cursor.fetchall() + + id_to_uuid = {} + for old_id, existing_uuid in records: + if existing_uuid: + # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format) + # (existing UUIDs might be stored with or without dashes in old schema) + id_to_uuid[old_id] = UUID(existing_uuid).hex + else: + # Generate new UUIDv7 (time-ordered) as 32-char hex + id_to_uuid[old_id] = uuid7().hex + + # Step 3: Copy data with UUIDs as new primary key + cursor.execute("SELECT * FROM core_archiveresult") + old_records = cursor.fetchall() + + # Get column names + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = cursor.fetchall() + col_names = [col[1] for col in columns] + + for i, record in enumerate(old_records): + old_id = record[col_names.index('id')] + new_uuid = id_to_uuid[old_id] + + # Build insert with new structure + values = {col_names[i]: record[i] for i in range(len(col_names))} + + # Check which fields exist in new table + fields_to_copy = [ + 'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name', + 'status', 'retry_at', 'start_ts', 'end_ts', + 'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', + 'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id' + ] + + # Build INSERT statement + existing_fields = [f for f in 
fields_to_copy if f in values] + placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid + field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields) + + insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields] + + cursor.execute( + f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", + insert_values + ) + + # Step 4: Replace old table with new table + cursor.execute("DROP TABLE core_archiveresult") + cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult") + + # Step 5: Create indexes + cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") + cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)") + cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)") + cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") + cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)") + cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)") + cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)") + cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)") + + print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0028_alter_snapshot_fs_version'), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + migrate_archiveresult_id_to_uuid, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # Remove old uuid field + migrations.RemoveField( + model_name='archiveresult', + name='uuid', + ), + # Change id from AutoField to UUIDField + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True), + ), + # Add old_id field to preserve legacy integer IDs + migrations.AddField( + model_name='archiveresult', + name='old_id', + field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'), + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0030_alter_archiveresult_id.py b/archivebox/core/migrations/0030_alter_archiveresult_id.py new file mode 100644 index 00000000..0c5e54b0 --- /dev/null +++ b/archivebox/core/migrations/0030_alter_archiveresult_id.py @@ -0,0 +1,19 @@ +# Generated by Django 6.0 on 2026-01-02 10:02 + +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0029_migrate_archiveresult_to_uuid_pk'), + ] + + operations = [ + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 3a21041a..3de5b4f8 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Migrate filesystem if needed (happens automatically on save) if self.pk and self.fs_migration_needed: + print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → 
{self._fs_current_version()}") # Walk through migration chain automatically current = self.fs_version target = self._fs_current_version() @@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Only run if method exists (most are no-ops) if hasattr(self, method): + print(f"[DEBUG save()] Running {method}()") getattr(self, method)() current = next_ver @@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea old_dir = self.get_storage_path_for_version('0.8.0') new_dir = self.get_storage_path_for_version('0.9.0') - if not old_dir.exists() or old_dir == new_dir or new_dir.exists(): - # Even if no directory migration needed, still convert index format - self.convert_index_json_to_jsonl() - return + print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}") + + if not old_dir.exists() or old_dir == new_dir: + # No migration needed + print(f"[DEBUG _fs_migrate] Returning None (early return)") + return None + + if new_dir.exists(): + # New directory already exists (files already copied), but we still need cleanup + # Return cleanup info so old directory can be cleaned up + print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)") + return (old_dir, new_dir) new_dir.mkdir(parents=True, exist_ok=True) @@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path): """ Delete old directory and create symlink after successful migration. - Called via transaction.on_commit() after DB commit succeeds. """ import shutil import logging - print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}") - # Delete old directory if old_dir.exists() and not old_dir.is_symlink(): - print(f"[DEBUG] Attempting to delete old directory: {old_dir}") try: shutil.rmtree(old_dir) - print(f"[DEBUG] Successfully deleted old directory: {old_dir}") except Exception as e: - # Log but don't raise - migration succeeded, this is just cleanup - print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}") logging.getLogger('archivebox.migration').warning( f"Could not remove old migration directory {old_dir}: {e}" ) return # Don't create symlink if cleanup failed - else: - print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}") # Create backwards-compat symlink (after old dir is deleted) symlink_path = old_dir # Same path as old_dir if symlink_path.is_symlink(): - print(f"[DEBUG] Unlinking existing symlink: {symlink_path}") symlink_path.unlink() if not symlink_path.exists(): - print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}") try: symlink_path.symlink_to(new_dir, target_is_directory=True) - print(f"[DEBUG] Successfully created symlink") except Exception as e: - print(f"[DEBUG] Failed to create symlink: {e}") logging.getLogger('archivebox.migration').warning( f"Could not create symlink from {symlink_path} to {new_dir}: {e}" ) - else: - print(f"[DEBUG] Symlink path already exists: {symlink_path}") # ========================================================================= # Path Calculation and Migration Helpers @@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not timestamp: return None - # Look up existing + # Look up existing (try exact match first, then fuzzy match for truncated timestamps) try: - return 
cls.objects.get(url=url, timestamp=timestamp) + snapshot = cls.objects.get(url=url, timestamp=timestamp) + print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}") + return snapshot except cls.DoesNotExist: + print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}") + # Try fuzzy match - index.json may have truncated timestamp + # e.g., index has "1767000340" but DB has "1767000340.624737" + candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp) + if candidates.count() == 1: + snapshot = candidates.first() + print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}") + return snapshot + elif candidates.count() > 1: + print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first") + return candidates.first() + print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}") return None except cls.MultipleObjectsReturned: # Should not happen with unique constraint + print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}") return cls.objects.filter(url=url, timestamp=timestamp).first() @classmethod @@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return archiveresults - def advance_step_if_ready(self) -> bool: - """ - Advance current_step if all foreground hooks in current step are finished. - - Called by the state machine to check if step can advance. - Background hooks (.bg) don't block step advancement. - - Step advancement rules: - - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED) - - Background ARs (hook_name contains '.bg.') are ignored for advancement - - When ready, increments current_step by 1 (up to 9) - - Returns: - True if step was advanced, False if not ready or already at step 9. - """ - from archivebox.hooks import extract_step, is_background_hook - - if self.current_step >= 9: - return False # Already at final step - - # Get all ARs for current step that are foreground - current_step_ars = self.archiveresult_set.filter( - hook_name__isnull=False - ).exclude(hook_name='') - - # Check each AR in current step - for ar in current_step_ars: - ar_step = extract_step(ar.hook_name) - if ar_step != self.current_step: - continue # Not in current step - - if is_background_hook(ar.hook_name): - continue # Background hooks don't block - - # Foreground hook in current step - check if finished - if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES: - # Still pending/queued - can't advance - return False - - if ar.status == ArchiveResult.StatusChoices.STARTED: - # Still running - can't advance - return False - - # All foreground hooks in current step are finished - advance! - self.current_step += 1 - self.save(update_fields=['current_step', 'modified_at']) - return True def is_finished_processing(self) -> bool: """ - Check if this snapshot has finished processing. + Check if all ArchiveResults are finished. - Used by SnapshotMachine.is_finished() to determine if snapshot is complete. - - Returns: - True if all archiveresults are finished (or no work to do), False otherwise. + Note: This is only called for observability/progress tracking. + SnapshotWorker owns the execution and doesn't poll this. 
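+
+        Illustrative call site (a sketch for clarity only; this patch does not add
+        such a call, and `snapshot` stands for any Snapshot instance):
+
+            if snapshot.is_finished_processing():
+                ...  # e.g. show the snapshot as complete in a progress view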
""" - # if no archiveresults exist yet, it's not finished - if not self.archiveresult_set.exists(): - return False + # Check if any ARs are still pending/started + pending = self.archiveresult_set.exclude( + status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES + ).exists() - # Try to advance step if ready (handles step-based hook execution) - # This will increment current_step when all foreground hooks in current step are done - while self.advance_step_if_ready(): - pass # Keep advancing until we can't anymore - - # if archiveresults exist but are still pending, it's not finished - if self.pending_archiveresults().exists(): - return False - - # Don't wait for background hooks - they'll be cleaned up on entering sealed state - # Background hooks in STARTED state are excluded by pending_archiveresults() - # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE, - # we can transition to sealed and cleanup() will kill the background hooks - - # otherwise archiveresults exist and are all finished, so it's finished - return True + return not pending def get_progress_stats(self) -> dict: """ @@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): tick = ( queued.to.itself(unless='can_start') | queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished') | started.to(sealed, cond='is_finished') ) @@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): can_start = bool(self.snapshot.url) return can_start + def is_finished(self) -> bool: + """Check if all ArchiveResults for this snapshot are finished.""" + return self.snapshot.is_finished_processing() + @queued.enter def enter_queued(self): self.snapshot.update_and_requeue( @@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): @started.enter def enter_started(self): - import sys - - print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr) - - # Run the snapshot - creates pending archiveresults for all enabled plugins - self.snapshot.run() - - # Check if any archiveresults were created - ar_count = self.snapshot.archiveresult_set.count() - print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr) - - if ar_count == 0: - # No archiveresults created, seal immediately - print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr) - self.seal() - else: - # Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs) - # Last AR will manually call self.seal() when done - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(days=365), - status=Snapshot.StatusChoices.STARTED, - ) - print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr) + """Just mark as started - SnapshotWorker will create ARs and run hooks.""" + self.snapshot.status = Snapshot.StatusChoices.STARTED + self.snapshot.retry_at = None # No more polling + self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) @sealed.enter def enter_sealed(self): @@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugins = [get_plugin_name(e) for e in get_plugins()] return tuple((e, e) for e in plugins) - # Keep AutoField for backward compatibility with 0.7.x databases - # UUID field is added separately by migration for new records - id = models.AutoField(primary_key=True, editable=False) - # 
Note: unique constraint is added by migration 0027 - don't set unique=True here - # or SQLite table recreation in earlier migrations will fail - uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True) + # UUID primary key (migrated from integer in 0029) + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + # old_id preserves the legacy integer ID for backward compatibility + old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions') + # Note: uuid field was removed in migration 0029 when id became UUID created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugin_dir = Path(self.snapshot.output_dir) / self.plugin start_ts = timezone.now() - is_bg_hook = False + process = None for hook in hooks: - # Check if this is a background hook - is_bg_hook = is_background_hook(hook.name) - - result = run_hook( + # Run hook using Process.launch() - returns Process model + process = run_hook( hook, output_dir=plugin_dir, config=config, @@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi depth=self.snapshot.depth, ) - # Background hooks return None - if result is None: - is_bg_hook = True - - # Update status based on hook execution - if is_bg_hook: - # BACKGROUND HOOK - still running, return immediately - # Status stays STARTED, will be finalized by Snapshot.cleanup() - self.status = self.StatusChoices.STARTED + # Link ArchiveResult to Process + self.process = process self.start_ts = start_ts - if self.process_id: - self.process.pwd = str(plugin_dir) - self.process.save() + self.save(update_fields=['process_id', 'start_ts', 'modified_at']) + + if not process: + # No hooks ran + self.status = self.StatusChoices.FAILED + self.output_str = 'No hooks executed' self.save() return + # Update status based on hook execution + if process.status == process.StatusChoices.RUNNING: + # BACKGROUND HOOK - still running, return immediately + # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup() + return + # FOREGROUND HOOK - completed, update from filesystem - self.start_ts = start_ts - if self.process_id: - self.process.pwd = str(plugin_dir) - self.process.save() self.update_from_output() # Clean up empty output directory if no files were created @@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True) # Tick Event - transitions based on conditions + # Flow: queued → started → (succeeded|failed|skipped) + # queued → skipped (if exceeded max attempts) + # started → backoff → started (retry) tick = ( + queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first queued.to.itself(unless='can_start') | queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished') | started.to(succeeded, cond='is_succeeded') | started.to(failed, cond='is_failed') | started.to(skipped, cond='is_skipped') | started.to(backoff, cond='is_backoff') | + backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too backoff.to.itself(unless='can_start') | - backoff.to(started, cond='can_start') | - backoff.to(succeeded, cond='is_succeeded') | - backoff.to(failed, cond='is_failed') | - backoff.to(skipped, 
cond='is_skipped') + backoff.to(started, cond='can_start') + # Removed redundant transitions: backoff.to(succeeded/failed/skipped) + # Reason: backoff should always retry→started, then started→final states ) def can_start(self) -> bool: - if not self.archiveresult.snapshot.url: - return False + """Pure function - check if AR can start (has valid URL).""" + return bool(self.archiveresult.snapshot.url) - # Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results + def is_exceeded_max_attempts(self) -> bool: + """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results.""" from archivebox.config.configset import get_config config = get_config( @@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): status=ArchiveResult.StatusChoices.FAILED ).count() - if failed_count >= max_attempts: - # Mark this result as skipped since we've hit the limit - self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED - self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)' - self.archiveresult.retry_at = None - self.archiveresult.save() - return False - - return True + return failed_count >= max_attempts def is_succeeded(self) -> bool: """Check if extractor plugin succeeded (status was set by run()).""" @@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): ) def is_finished(self) -> bool: - """Check if extraction has completed (success, failure, or skipped).""" - return self.archiveresult.status in ( + """ + Check if extraction has completed (success, failure, or skipped). + + For background hooks in STARTED state, checks if their Process has finished and reaps them. + """ + # If already in final state, return True + if self.archiveresult.status in ( ArchiveResult.StatusChoices.SUCCEEDED, ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, - ) + ): + return True + + # If in STARTED state with a Process, check if Process has finished running + if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED: + if self.archiveresult.process_id: + process = self.archiveresult.process + + # If process is NOT running anymore, reap the background hook + if not process.is_running(): + self.archiveresult.update_from_output() + # Check if now in final state after reaping + return self.archiveresult.status in ( + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ) + + return False @queued.enter def enter_queued(self): @@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): ) def _check_and_seal_parent_snapshot(self): - """Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.""" + """ + Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot. + + Note: In the new architecture, SnapshotWorker handles step advancement and sealing. + This method is kept for backwards compatibility with manual CLI commands. 
+ """ import sys snapshot = self.archiveresult.snapshot @@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): def enter_failed(self): import sys + print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr) + self.archiveresult.update_and_requeue( retry_at=None, status=ArchiveResult.StatusChoices.FAILED, @@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): def enter_skipped(self): import sys + # Set output_str if not already set (e.g., when skipped due to max attempts) + if not self.archiveresult.output_str and self.is_exceeded_max_attempts(): + from archivebox.config.configset import get_config + config = get_config( + crawl=self.archiveresult.snapshot.crawl, + snapshot=self.archiveresult.snapshot, + ) + max_attempts = config.get('MAX_URL_ATTEMPTS', 50) + self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)' + self.archiveresult.update_and_requeue( retry_at=None, status=ArchiveResult.StatusChoices.SKIPPED, diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index da08b0ac..110fe941 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -281,25 +281,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): """Editor for crawl URLs.""" widget_id = f'crawl_urls_{obj.pk}' - # Check if it's a local file we can edit - source_file = obj.get_file_path() - is_file = source_file is not None - file_contents = "" - error = None - - if is_file and source_file: - try: - file_contents = source_file.read_text().strip() - except Exception as e: - error = f'Error reading {source_file}: {e}' - # Escape for safe HTML embedding escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') - escaped_file_contents = file_contents.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') # Count lines for auto-expand logic line_count = len((obj.urls or '').split('\n')) - file_line_count = len(file_contents.split('\n')) if file_contents else 0 uri_rows = min(max(3, line_count), 10) html = f''' @@ -318,21 +304,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): {line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
-        [removed here: conditional file-editor HTML block, i.e. a read-only textarea showing escaped_file_contents when is_file is set, plus the error message when the source file could not be read]
- '''} - ''' return mark_safe(html) diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 52ed6c81..d8df425c 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -114,22 +114,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith }, ) - @classmethod - def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto', - tags_str: str = '', config=None, created_by=None): - """Create a crawl from a file containing URLs.""" - urls_content = source_file.read_text() - crawl = cls.objects.create( - urls=urls_content, - extractor=extractor, - max_depth=max_depth, - tags_str=tags_str, - label=label or source_file.name, - config=config or {}, - created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(), - ) - return crawl - @property def api_url(self) -> str: return reverse_lazy('api-1:get_crawl', args=[self.id]) @@ -196,15 +180,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return crawl @property - def output_dir_parent(self) -> str: - """Construct parent directory: users/{username}/crawls/{YYYYMMDD}""" - date_str = self.created_at.strftime('%Y%m%d') - return f'users/{self.created_by.username}/crawls/{date_str}' + def OUTPUT_DIR(self) -> Path: + """ + Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id} + Domain is extracted from the first URL in the crawl. + """ + from archivebox import DATA_DIR + from archivebox.core.models import Snapshot - @property - def output_dir_name(self) -> str: - """Use crawl ID as directory name""" - return str(self.id) + date_str = self.created_at.strftime('%Y%m%d') + urls = self.get_urls_list() + domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown' + + return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id) def get_urls_list(self) -> list[str]: """Get list of URLs from urls field, filtering out comments and empty lines.""" @@ -216,52 +204,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if url.strip() and not url.strip().startswith('#') ] - def get_file_path(self) -> Path | None: - """ - Get filesystem path if this crawl references a local file. - Checks if the first URL is a file:// URI. 
- """ - urls = self.get_urls_list() - if not urls: - return None - - first_url = urls[0] - if not first_url.startswith('file://'): - return None - - # Remove file:// prefix - path_str = first_url.replace('file://', '', 1) - return Path(path_str) - - def create_root_snapshot(self) -> 'Snapshot': - from archivebox.core.models import Snapshot - - first_url = self.get_urls_list()[0] if self.get_urls_list() else None - if not first_url: - raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from') - - # Try to get existing snapshot - try: - snapshot = Snapshot.objects.get(crawl=self, url=first_url) - # If exists and already queued/started, return it as-is - if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: - # Update retry_at to now so it can be picked up immediately - snapshot.retry_at = timezone.now() - snapshot.save(update_fields=['retry_at']) - return snapshot - except Snapshot.DoesNotExist: - pass - - # Create new snapshot - root_snapshot = Snapshot.objects.create( - crawl=self, - url=first_url, - status=Snapshot.INITIAL_STATE, - retry_at=timezone.now(), - timestamp=str(timezone.now().timestamp()), - depth=0, - ) - return root_snapshot def add_url(self, entry: dict) -> bool: """ @@ -316,11 +258,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Returns: List of newly created Snapshot objects """ + import sys import json from archivebox.core.models import Snapshot created_snapshots = [] + print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr) + print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr) + for line in self.urls.splitlines(): if not line.strip(): continue @@ -329,13 +275,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith try: entry = json.loads(line) url = entry.get('url', '') - depth = entry.get('depth', 1) + depth = entry.get('depth', 0) title = entry.get('title') timestamp = entry.get('timestamp') tags = entry.get('tags', '') except json.JSONDecodeError: url = line.strip() - depth = 1 + depth = 0 title = None timestamp = None tags = '' @@ -379,41 +325,90 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith The root Snapshot for this crawl, or None for system crawls that don't create snapshots """ import time + import json from pathlib import Path from archivebox.hooks import run_hook, discover_hooks, process_hook_records from archivebox.config.configset import get_config + # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode) + debug_log = Path('/tmp/archivebox_crawl_debug.log') + with open(debug_log, 'a') as f: + f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n') + f.flush() + # Get merged config with crawl context config = get_config(crawl=self) + # Load all binaries.jsonl files from plugins + # This replaces individual on_Crawl install hooks with declarative configuration + from archivebox.hooks import BUILTIN_PLUGINS_DIR + from archivebox.machine.models import Machine + + machine_id = str(Machine.current().id) + binaries_records = [] + + for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'): + try: + with open(binaries_file, 'r') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + try: + record = json.loads(line) + if record.get('type') == 'Binary': + record['machine_id'] = machine_id + binaries_records.append(record) + 
except json.JSONDecodeError: + pass + except Exception: + pass + + # Process binary declarations before running hooks + if binaries_records: + overrides = {'crawl': self} + process_hook_records(binaries_records, overrides=overrides) + # Discover and run on_Crawl hooks + with open(debug_log, 'a') as f: + f.write(f'Discovering Crawl hooks...\n') + f.flush() hooks = discover_hooks('Crawl', config=config) - first_url = self.get_urls_list()[0] if self.get_urls_list() else '' + with open(debug_log, 'a') as f: + f.write(f'Found {len(hooks)} hooks\n') + f.flush() for hook in hooks: + with open(debug_log, 'a') as f: + f.write(f'Running hook: {hook.name}\n') + f.flush() hook_start = time.time() plugin_name = hook.parent.name output_dir = self.OUTPUT_DIR / plugin_name output_dir.mkdir(parents=True, exist_ok=True) - result = run_hook( + # Run hook using Process.launch() - returns Process model + process = run_hook( hook, output_dir=output_dir, config=config, crawl_id=str(self.id), - source_url=first_url, + source_url=self.urls, # Pass full newline-separated URLs ) + with open(debug_log, 'a') as f: + f.write(f'Hook {hook.name} completed with status={process.status}\n') + f.flush() hook_elapsed = time.time() - hook_start if hook_elapsed > 0.5: # Log slow hooks print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]') - # Background hook - returns None, continues running - if result is None: + # Background hook - still running + if process.status == process.StatusChoices.RUNNING: continue # Foreground hook - process JSONL records - records = result.get('records', []) + from archivebox.hooks import extract_records_from_process + records = extract_records_from_process(process) if records: print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]') for record in records[:3]: # Show first 3 @@ -423,14 +418,33 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if stats: print(f'[green]✓ Created: {stats}[/green]') - # System crawls (archivebox://*) don't create snapshots - they just run hooks - if first_url.startswith('archivebox://'): - return None + # Create snapshots from all URLs in self.urls + with open(debug_log, 'a') as f: + f.write(f'Creating snapshots from URLs...\n') + f.flush() + created_snapshots = self.create_snapshots_from_urls() + with open(debug_log, 'a') as f: + f.write(f'Created {len(created_snapshots)} snapshots\n') + f.write(f'=== Crawl.run() complete ===\n\n') + f.flush() + return created_snapshots[0] if created_snapshots else None - # Create snapshots from URLs - root_snapshot = self.create_root_snapshot() - self.create_snapshots_from_urls() - return root_snapshot + def is_finished(self) -> bool: + """Check if crawl is finished (all snapshots sealed or no snapshots exist).""" + from archivebox.core.models import Snapshot + + # Check if any snapshots exist for this crawl + snapshots = Snapshot.objects.filter(crawl=self) + + # If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks) + if not snapshots.exists(): + return True + + # If snapshots exist, check if all are sealed + if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists(): + return False + + return True def cleanup(self): """Clean up background hooks and run on_CrawlEnd hooks.""" @@ -452,7 +466,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith config = get_config(crawl=self) hooks = discover_hooks('CrawlEnd', config=config) - first_url = 
self.get_urls_list()[0] if self.get_urls_list() else '' for hook in hooks: plugin_name = hook.parent.name @@ -464,7 +477,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith output_dir=output_dir, config=config, crawl_id=str(self.id), - source_url=first_url, + source_url=self.urls, # Pass full newline-separated URLs ) # Log failures but don't block @@ -494,7 +507,6 @@ class CrawlMachine(BaseStateMachine, strict_states=True): │ - run_hook(script, output_dir, ...) │ │ - Parse JSONL from hook output │ │ - process_hook_records() → creates Snapshots │ - │ • create_root_snapshot() → root snapshot for crawl │ │ • create_snapshots_from_urls() → from self.urls field │ │ │ │ 2. Snapshots process independently with their own │ @@ -518,7 +530,8 @@ class CrawlMachine(BaseStateMachine, strict_states=True): # Tick Event (polled by workers) tick = ( queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') + queued.to(started, cond='can_start') | + started.to(sealed, cond='is_finished') ) # Manual event (triggered by last Snapshot sealing) @@ -534,6 +547,10 @@ class CrawlMachine(BaseStateMachine, strict_states=True): return False return True + def is_finished(self) -> bool: + """Check if all Snapshots for this crawl are finished.""" + return self.crawl.is_finished() + @started.enter def enter_started(self): import sys @@ -543,25 +560,21 @@ class CrawlMachine(BaseStateMachine, strict_states=True): try: # Run the crawl - runs hooks, processes JSONL, creates snapshots - root_snapshot = self.crawl.run() + first_snapshot = self.crawl.run() - if root_snapshot: - print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr) + if first_snapshot: + print(f'[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr) # Update status to STARTED - # Set retry_at to None so workers don't claim us (we wait for snapshots to finish) - # Last snapshot will manually call self.seal() when done + # Set retry_at to near future so tick() can poll and check is_finished() self.crawl.update_and_requeue( - retry_at=None, + retry_at=timezone.now() + timedelta(seconds=2), status=Crawl.StatusChoices.STARTED, ) else: # No snapshots (system crawl like archivebox://install) - print(f'[cyan]🔄 No snapshots created, allowing immediate seal[/cyan]', file=sys.stderr) - # Set retry_at=now so next tick() will transition to sealed - self.crawl.update_and_requeue( - retry_at=timezone.now(), - status=Crawl.StatusChoices.STARTED, - ) + print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) + # Seal immediately since there's no work to do + self.seal() except Exception as e: print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]') diff --git a/archivebox/hooks.py b/archivebox/hooks.py index b21022dc..0f69ad77 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -240,13 +240,14 @@ def run_hook( output_dir: Path, config: Dict[str, Any], timeout: Optional[int] = None, + parent: Optional['Process'] = None, **kwargs: Any -) -> HookResult: +) -> 'Process': """ - Execute a hook script with the given arguments. + Execute a hook script with the given arguments using Process model. - This is the low-level hook executor. For running extractors with proper - metadata handling, use call_extractor() instead. + This is the low-level hook executor that creates a Process record and + uses Process.launch() for subprocess management. 
Config is passed to hooks via environment variables. Caller MUST use get_config() to merge all sources (file, env, machine, crawl, snapshot). @@ -257,16 +258,20 @@ def run_hook( config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED timeout: Maximum execution time in seconds If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300) + parent: Optional parent Process (for tracking worker->hook hierarchy) **kwargs: Arguments passed to the script as --key=value Returns: - HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms' + Process model instance (use process.exit_code, process.stdout, process.get_records()) Example: from archivebox.config.configset import get_config config = get_config(crawl=my_crawl, snapshot=my_snapshot) - result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id) + process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id) + if process.status == 'exited': + records = process.get_records() # Get parsed JSONL output """ + from archivebox.machine.models import Process, Machine import time start_time = time.time() @@ -276,18 +281,32 @@ def run_hook( plugin_config = get_plugin_special_config(plugin_name, config) timeout = plugin_config['timeout'] + # Get current machine + machine = Machine.current() + + # Auto-detect parent process if not explicitly provided + # This enables automatic hierarchy tracking: Worker -> Hook + if parent is None: + try: + parent = Process.current() + except Exception: + # If Process.current() fails (e.g., not in a worker context), leave parent as None + pass + if not script.exists(): - return HookResult( - returncode=1, - stdout='', + # Create a failed Process record for hooks that don't exist + process = Process.objects.create( + machine=machine, + parent=parent, + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=['echo', f'Hook script not found: {script}'], + timeout=timeout, + status=Process.StatusChoices.EXITED, + exit_code=1, stderr=f'Hook script not found: {script}', - output_json=None, - output_files=[], - duration_ms=0, - hook=str(script), - plugin=script.parent.name, - hook_name=script.name, ) + return process # Determine the interpreter based on file extension ext = script.suffix.lower() @@ -379,130 +398,138 @@ def run_hook( # Create output directory if needed output_dir.mkdir(parents=True, exist_ok=True) - # Capture files before execution to detect new output - files_before = set(output_dir.rglob('*')) if output_dir.exists() else set() - # Detect if this is a background hook (long-running daemon) # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js) # Old convention: __background in stem (for backwards compatibility) is_background = '.bg.' 
in script.name or '__background' in script.stem - # Set up output files for ALL hooks (useful for debugging) - stdout_file = output_dir / 'stdout.log' - stderr_file = output_dir / 'stderr.log' - pid_file = output_dir / 'hook.pid' - cmd_file = output_dir / 'cmd.sh' - try: - # Write command script for validation - from archivebox.misc.process_utils import write_cmd_file - write_cmd_file(cmd_file, cmd) - - # Open log files for writing - with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err: - process = subprocess.Popen( - cmd, - cwd=str(output_dir), - stdout=out, - stderr=err, - env=env, - ) - - # Write PID with mtime set to process start time for validation - from archivebox.misc.process_utils import write_pid_file_with_mtime - process_start_time = time.time() - write_pid_file_with_mtime(pid_file, process.pid, process_start_time) - - if is_background: - # Background hook - return None immediately, don't wait - # Process continues running, writing to stdout.log - # ArchiveResult will poll for completion later - return None - - # Normal hook - wait for completion with timeout - try: - returncode = process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - process.wait() # Clean up zombie - duration_ms = int((time.time() - start_time) * 1000) - return HookResult( - returncode=-1, - stdout='', - stderr=f'Hook timed out after {timeout} seconds', - output_json=None, - output_files=[], - duration_ms=duration_ms, - hook=str(script), - ) - - # Read output from files - stdout = stdout_file.read_text() if stdout_file.exists() else '' - stderr = stderr_file.read_text() if stderr_file.exists() else '' - - # Detect new files created by the hook - files_after = set(output_dir.rglob('*')) if output_dir.exists() else set() - new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()] - # Exclude the log files themselves from new_files - new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')] - - # Parse JSONL output from stdout - # Each line starting with { that has 'type' field is a record - records = [] - plugin_name = script.parent.name # Plugin directory name (e.g., 'wget') - hook_name = script.name # Full hook filename (e.g., 'on_Snapshot__50_wget.py') - - for line in stdout.splitlines(): - line = line.strip() - if not line or not line.startswith('{'): - continue - - try: - data = json.loads(line) - if 'type' in data: - # Add plugin metadata to every record - data['plugin'] = plugin_name - data['hook_name'] = hook_name - data['plugin_hook'] = str(script) - records.append(data) - except json.JSONDecodeError: - pass - - duration_ms = int((time.time() - start_time) * 1000) - - # Clean up log files on success (keep on failure for debugging) - if returncode == 0: - stdout_file.unlink(missing_ok=True) - stderr_file.unlink(missing_ok=True) - pid_file.unlink(missing_ok=True) - - return HookResult( - returncode=returncode, - stdout=stdout, - stderr=stderr, - output_json=None, # Legacy field, we now use records for JSONL - output_files=new_files, - duration_ms=duration_ms, - hook=str(script), - plugin=plugin_name, - hook_name=hook_name, - records=records, + # Create Process record + process = Process.objects.create( + machine=machine, + parent=parent, + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + timeout=timeout, ) + # Build environment from config (Process._build_env() expects self.env dict) + # We need to set env on the process before launching + process.env = {} + for key, 
value in config.items(): + if value is None: + continue + elif isinstance(value, bool): + process.env[key] = 'true' if value else 'false' + elif isinstance(value, (list, dict)): + process.env[key] = json.dumps(value) + else: + process.env[key] = str(value) + + # Add base paths to env + process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd())) + process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive')) + process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', '')) + + # Add LIB_DIR and LIB_BIN_DIR + lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None)) + lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None)) + if lib_dir: + process.env['LIB_DIR'] = str(lib_dir) + if not lib_bin_dir and lib_dir: + lib_bin_dir = Path(lib_dir) / 'bin' + if lib_bin_dir: + process.env['LIB_BIN_DIR'] = str(lib_bin_dir) + + # Set PATH from Machine.config if available + try: + if machine and machine.config: + machine_path = machine.config.get('config/PATH') + if machine_path: + # Prepend LIB_BIN_DIR to machine PATH as well + if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'): + process.env['PATH'] = f'{lib_bin_dir}:{machine_path}' + else: + process.env['PATH'] = machine_path + elif lib_bin_dir: + # Just prepend to current PATH + current_path = os.environ.get('PATH', '') + if not current_path.startswith(f'{lib_bin_dir}:'): + process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir) + + # Also set NODE_MODULES_DIR if configured + node_modules_dir = machine.config.get('config/NODE_MODULES_DIR') + if node_modules_dir: + process.env['NODE_MODULES_DIR'] = node_modules_dir + except Exception: + pass # Fall back to system PATH if Machine not available + + # Save env before launching + process.save() + + # Launch subprocess using Process.launch() + process.launch(background=is_background) + + # Return Process object (caller can use process.exit_code, process.stdout, process.get_records()) + return process + except Exception as e: - duration_ms = int((time.time() - start_time) * 1000) - return HookResult( - returncode=-1, - stdout='', + # Create a failed Process record for exceptions + process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + timeout=timeout, + status=Process.StatusChoices.EXITED, + exit_code=-1, stderr=f'Failed to run hook: {type(e).__name__}: {e}', - output_json=None, - output_files=[], - duration_ms=duration_ms, - hook=str(script), - plugin=script.parent.name, - hook_name=script.name, - records=[], ) + return process + + +def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]: + """ + Extract JSONL records from a Process's stdout. + + Uses the same parse_line() logic from misc/jsonl.py. + Adds plugin metadata to each record. 
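+    Falls back to reading the on-disk stdout_file if stdout has not yet been
+    copied into the DB (e.g. for a process that only just exited).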
+ + Args: + process: Process model instance with stdout captured + + Returns: + List of parsed JSONL records with plugin metadata + """ + from archivebox.misc.jsonl import parse_line + + records = [] + + # Read stdout from process + stdout = process.stdout + if not stdout and process.stdout_file and process.stdout_file.exists(): + stdout = process.stdout_file.read_text() + + if not stdout: + return records + + # Extract plugin metadata from process.pwd and process.cmd + plugin_name = Path(process.pwd).name if process.pwd else 'unknown' + hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown' + plugin_hook = process.cmd[1] if len(process.cmd) > 1 else '' + + # Parse each line as JSONL + for line in stdout.splitlines(): + record = parse_line(line) + if record and 'type' in record: + # Add plugin metadata to record + record.setdefault('plugin', plugin_name) + record.setdefault('hook_name', hook_name) + record.setdefault('plugin_hook', plugin_hook) + records.append(record) + + return records def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]: @@ -940,7 +967,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ else: # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) import sys - print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr) + print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr) enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) if enabled is None: diff --git a/archivebox/machine/migrations/0009_alter_binary_status.py b/archivebox/machine/migrations/0009_alter_binary_status.py new file mode 100644 index 00000000..88ed39ad --- /dev/null +++ b/archivebox/machine/migrations/0009_alter_binary_status.py @@ -0,0 +1,18 @@ +# Generated by Django 6.0 on 2026-01-02 08:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0008_add_worker_type_field'), + ] + + operations = [ + migrations.AlterField( + model_name='binary', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('installed', 'Installed')], db_index=True, default='queued', max_length=16), + ), + ] diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 417e4c9f..73680905 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -683,6 +683,7 @@ class Process(models.Model): ORCHESTRATOR = 'orchestrator', 'Orchestrator' WORKER = 'worker', 'Worker' CLI = 'cli', 'CLI' + HOOK = 'hook', 'Hook' BINARY = 'binary', 'Binary' # Primary fields @@ -1415,6 +1416,10 @@ class Process(models.Model): """ Check if process has exited and update status if so. 
+ Cleanup when process exits: + - Copy stdout/stderr to DB (keep files for debugging) + - Delete PID file + Returns: exit_code if exited, None if still running """ @@ -1422,11 +1427,25 @@ class Process(models.Model): return self.exit_code if not self.is_running: - # Process exited - read output and update status + # Process exited - read output and copy to DB if self.stdout_file and self.stdout_file.exists(): self.stdout = self.stdout_file.read_text() + # TODO: Uncomment to cleanup (keeping for debugging for now) + # self.stdout_file.unlink(missing_ok=True) if self.stderr_file and self.stderr_file.exists(): self.stderr = self.stderr_file.read_text() + # TODO: Uncomment to cleanup (keeping for debugging for now) + # self.stderr_file.unlink(missing_ok=True) + + # Clean up PID file (not needed for debugging) + if self.pid_file and self.pid_file.exists(): + self.pid_file.unlink(missing_ok=True) + + # TODO: Uncomment to cleanup cmd.sh (keeping for debugging for now) + # if self.pwd: + # cmd_file = Path(self.pwd) / 'cmd.sh' + # if cmd_file.exists(): + # cmd_file.unlink(missing_ok=True) # Try to get exit code from proc or default to unknown self.exit_code = self.exit_code if self.exit_code is not None else -1 @@ -1686,6 +1705,46 @@ class Process(models.Model): """ return cls.get_running_count(process_type=process_type, machine=machine) + @classmethod + def cleanup_orphaned_chrome(cls) -> int: + """ + Kill orphaned Chrome processes using chrome_utils.js killZombieChrome. + + Scans DATA_DIR for chrome/*.pid files from stale crawls (>5 min old) + and kills any orphaned Chrome processes. + + Called by: + - Orchestrator on startup (cleanup from previous crashes) + - Orchestrator periodically (every N minutes) + + Returns: + Number of zombie Chrome processes killed + """ + import subprocess + from pathlib import Path + from django.conf import settings + + chrome_utils = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'chrome_utils.js' + if not chrome_utils.exists(): + return 0 + + try: + result = subprocess.run( + ['node', str(chrome_utils), 'killZombieChrome', str(settings.DATA_DIR)], + capture_output=True, + timeout=30, + text=True, + ) + if result.returncode == 0: + killed = int(result.stdout.strip()) + if killed > 0: + print(f'[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]') + return killed + except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e: + print(f'[red]Failed to cleanup orphaned Chrome: {e}[/red]') + + return 0 + # ============================================================================= # Binary State Machine diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 547b3b68..a3ad4566 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -530,13 +530,13 @@ def log_worker_event( Log a worker event with structured metadata and indentation. Args: - worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.) + worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker) event: Event name (Starting, Completed, Failed, etc.) 
- indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker) + indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker) pid: Process ID - worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker) - url: URL being processed (for SnapshotWorker/ArchiveResultWorker) - plugin: Plugin name (for ArchiveResultWorker) + worker_id: Worker ID (UUID for workers) + url: URL being processed (for SnapshotWorker) + plugin: Plugin name (for hook processes) metadata: Dict of metadata to show in curly braces error: Exception if event is an error """ diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py new file mode 100644 index 00000000..f386cd13 --- /dev/null +++ b/archivebox/misc/progress_layout.py @@ -0,0 +1,345 @@ +""" +Rich Layout-based live progress display for ArchiveBox orchestrator. + +Shows a comprehensive dashboard with: +- Top: Crawl queue status (full width) +- Middle: 4-column grid of SnapshotWorker progress panels +- Bottom: Orchestrator/Daphne logs +""" + +__package__ = 'archivebox.misc' + +from datetime import datetime, timezone +from typing import Dict, List, Optional, Any +from collections import deque + +from rich import box +from rich.align import Align +from rich.console import Console, Group, RenderableType +from rich.layout import Layout +from rich.panel import Panel +from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn +from rich.table import Table +from rich.text import Text + +from archivebox.config import VERSION + +# Maximum number of SnapshotWorker columns to display +MAX_WORKER_COLUMNS = 4 + + +class CrawlQueuePanel: + """Display crawl queue status across full width.""" + + def __init__(self): + self.orchestrator_status = "Idle" + self.crawl_queue_count = 0 + self.crawl_workers_count = 0 + self.max_crawl_workers = 8 + self.crawl_id: Optional[str] = None + + def __rich__(self) -> Panel: + grid = Table.grid(expand=True) + grid.add_column(justify="left", ratio=1) + grid.add_column(justify="center", ratio=1) + grid.add_column(justify="center", ratio=1) + grid.add_column(justify="right", ratio=1) + + # Left: ArchiveBox version + timestamp + left_text = Text() + left_text.append("ArchiveBox ", style="bold cyan") + left_text.append(f"v{VERSION}", style="bold yellow") + left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53") + + # Center-left: Crawl queue status + queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53" + center_left_text = Text() + center_left_text.append("Crawls: ", style="white") + center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}") + center_left_text.append(" queued", style="grey53") + + # Center-right: CrawlWorker status + worker_style = "green" if self.crawl_workers_count > 0 else "grey53" + center_right_text = Text() + center_right_text.append("Workers: ", style="white") + center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}") + center_right_text.append(" active", style="grey53") + + # Right: Orchestrator status + status_color = "green" if self.crawl_workers_count > 0 else "grey53" + right_text = Text() + right_text.append("Status: ", style="white") + right_text.append(self.orchestrator_status, style=f"bold {status_color}") + if self.crawl_id: + right_text.append(f" [{self.crawl_id[:8]}]", style="grey53") + + grid.add_row(left_text, center_left_text, 
center_right_text, right_text) + return Panel(grid, style="white on blue", box=box.ROUNDED) + + +class SnapshotWorkerPanel: + """Display progress for a single SnapshotWorker.""" + + def __init__(self, worker_num: int): + self.worker_num = worker_num + self.snapshot_id: Optional[str] = None + self.snapshot_url: Optional[str] = None + self.total_hooks: int = 0 + self.completed_hooks: int = 0 + self.current_plugin: Optional[str] = None + self.status: str = "idle" # idle, working, completed + self.recent_logs: deque = deque(maxlen=5) + + def __rich__(self) -> Panel: + if self.status == "idle": + content = Align.center( + Text("Idle", style="grey53"), + vertical="middle", + ) + border_style = "grey53" + title_style = "grey53" + else: + # Build progress display + lines = [] + + # URL (truncated) + if self.snapshot_url: + url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url + lines.append(Text(url_display, style="cyan")) + lines.append(Text()) # Spacing + + # Progress bar + if self.total_hooks > 0: + pct = (self.completed_hooks / self.total_hooks) * 100 + bar_width = 30 + filled = int((pct / 100) * bar_width) + bar = "█" * filled + "░" * (bar_width - filled) + + # Color based on progress + if pct < 30: + bar_style = "yellow" + elif pct < 100: + bar_style = "green" + else: + bar_style = "blue" + + progress_text = Text() + progress_text.append(bar, style=bar_style) + progress_text.append(f" {pct:.0f}%", style="white") + lines.append(progress_text) + lines.append(Text()) # Spacing + + # Stats + stats = Table.grid(padding=(0, 1)) + stats.add_column(style="grey53", no_wrap=True) + stats.add_column(style="white") + stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}") + if self.current_plugin: + stats.add_row("Current:", Text(self.current_plugin, style="yellow")) + lines.append(stats) + lines.append(Text()) # Spacing + + # Recent logs + if self.recent_logs: + lines.append(Text("Recent:", style="grey53")) + for log_msg, log_style in self.recent_logs: + log_text = Text(f"• {log_msg[:30]}", style=log_style) + lines.append(log_text) + + content = Group(*lines) + border_style = "green" if self.status == "working" else "blue" + title_style = "green" if self.status == "working" else "blue" + + return Panel( + content, + title=f"[{title_style}]Worker {self.worker_num}", + border_style=border_style, + box=box.ROUNDED, + height=20, + ) + + def add_log(self, message: str, style: str = "white"): + """Add a log message to this worker's recent logs.""" + self.recent_logs.append((message, style)) + + +class OrchestratorLogPanel: + """Display orchestrator and system logs.""" + + def __init__(self, max_events: int = 15): + self.events: deque = deque(maxlen=max_events) + self.max_events = max_events + + def add_event(self, message: str, style: str = "white"): + """Add an event to the log.""" + timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S") + self.events.append((timestamp, message, style)) + + def __rich__(self) -> Panel: + if not self.events: + content = Text("No recent events", style="grey53", justify="center") + else: + lines = [] + for timestamp, message, style in self.events: + line = Text() + line.append(f"[{timestamp}] ", style="grey53") + line.append(message, style=style) + lines.append(line) + content = Group(*lines) + + return Panel( + content, + title="[bold white]Orchestrator / Daphne Logs", + border_style="white", + box=box.ROUNDED, + height=12, + ) + + +class ArchiveBoxProgressLayout: + """ + Main layout manager for ArchiveBox 
orchestrator progress display. + + Layout structure: + ┌─────────────────────────────────────────────────────────────┐ + │ Crawl Queue (full width) │ + ├───────────────┬───────────────┬───────────────┬─────────────┤ + │ Snapshot │ Snapshot │ Snapshot │ Snapshot │ + │ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │ + │ │ │ │ │ + │ Progress + │ Progress + │ Progress + │ Progress + │ + │ Stats + │ Stats + │ Stats + │ Stats + │ + │ Logs │ Logs │ Logs │ Logs │ + ├───────────────┴───────────────┴───────────────┴─────────────┤ + │ Orchestrator / Daphne Logs │ + └─────────────────────────────────────────────────────────────┘ + """ + + def __init__(self, crawl_id: Optional[str] = None): + self.crawl_id = crawl_id + self.start_time = datetime.now(timezone.utc) + + # Create components + self.crawl_queue = CrawlQueuePanel() + self.crawl_queue.crawl_id = crawl_id + + # Create 4 worker panels + self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)] + + self.orchestrator_log = OrchestratorLogPanel(max_events=12) + + # Create layout + self.layout = self._make_layout() + + # Track snapshot ID to worker panel mapping + self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index + + def _make_layout(self) -> Layout: + """Define the layout structure.""" + layout = Layout(name="root") + + # Top-level split: crawl_queue, workers, logs + layout.split( + Layout(name="crawl_queue", size=3), + Layout(name="workers", ratio=1), + Layout(name="logs", size=13), + ) + + # Split workers into 4 columns + layout["workers"].split_row( + Layout(name="worker1"), + Layout(name="worker2"), + Layout(name="worker3"), + Layout(name="worker4"), + ) + + # Assign components to layout sections + layout["crawl_queue"].update(self.crawl_queue) + layout["worker1"].update(self.worker_panels[0]) + layout["worker2"].update(self.worker_panels[1]) + layout["worker3"].update(self.worker_panels[2]) + layout["worker4"].update(self.worker_panels[3]) + layout["logs"].update(self.orchestrator_log) + + return layout + + def update_orchestrator_status( + self, + status: str, + crawl_queue_count: int = 0, + crawl_workers_count: int = 0, + max_crawl_workers: int = 8, + ): + """Update orchestrator status in the crawl queue panel.""" + self.crawl_queue.orchestrator_status = status + self.crawl_queue.crawl_queue_count = crawl_queue_count + self.crawl_queue.crawl_workers_count = crawl_workers_count + self.crawl_queue.max_crawl_workers = max_crawl_workers + + def update_snapshot_worker( + self, + snapshot_id: str, + url: str, + total: int, + completed: int, + current_plugin: str = "", + ): + """Update or assign a snapshot to a worker panel.""" + # Find or assign worker panel for this snapshot + if snapshot_id not in self.snapshot_to_worker: + # Find first idle worker panel + worker_idx = None + for idx, panel in enumerate(self.worker_panels): + if panel.status == "idle": + worker_idx = idx + break + + # If no idle worker, use round-robin (shouldn't happen often) + if worker_idx is None: + worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS + + self.snapshot_to_worker[snapshot_id] = worker_idx + + # Get assigned worker panel + worker_idx = self.snapshot_to_worker[snapshot_id] + panel = self.worker_panels[worker_idx] + + # Update panel + panel.snapshot_id = snapshot_id + panel.snapshot_url = url + panel.total_hooks = total + panel.completed_hooks = completed + panel.current_plugin = current_plugin + panel.status = "working" if completed < total else "completed" + + def remove_snapshot_worker(self, 
snapshot_id: str): + """Mark a snapshot worker as idle after completion.""" + if snapshot_id in self.snapshot_to_worker: + worker_idx = self.snapshot_to_worker[snapshot_id] + panel = self.worker_panels[worker_idx] + + # Mark as idle + panel.status = "idle" + panel.snapshot_id = None + panel.snapshot_url = None + panel.total_hooks = 0 + panel.completed_hooks = 0 + panel.current_plugin = None + panel.recent_logs.clear() + + # Remove mapping + del self.snapshot_to_worker[snapshot_id] + + def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"): + """Add a log message to a specific worker's panel.""" + if snapshot_id in self.snapshot_to_worker: + worker_idx = self.snapshot_to_worker[snapshot_id] + self.worker_panels[worker_idx].add_log(message, style) + + def log_event(self, message: str, style: str = "white"): + """Add an event to the orchestrator log.""" + self.orchestrator_log.add_event(message, style) + + def get_layout(self) -> Layout: + """Get the Rich Layout object for rendering.""" + return self.layout diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py index 0c85b145..addd51df 100644 --- a/archivebox/plugins/accessibility/tests/test_accessibility.py +++ b/archivebox/plugins/accessibility/tests/test_accessibility.py @@ -72,10 +72,8 @@ class TestAccessibilityWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the accessibility hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) # Run accessibility hook with the active Chrome session result = subprocess.run( @@ -116,6 +114,85 @@ class TestAccessibilityWithChrome(TestCase): self.skipTest(f"Chrome session setup failed: {e}") raise + def test_accessibility_disabled_skips(self): + """Test that ACCESSIBILITY_ENABLED=False skips without error.""" + test_url = 'https://example.com' + snapshot_id = 'test-disabled' + + env = get_test_env() + env['ACCESSIBILITY_ENABLED'] = 'False' + + result = subprocess.run( + ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(self.temp_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) + + # Should exit 0 even when disabled + self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}") + + # Should NOT create output file when disabled + accessibility_output = self.temp_dir / 'accessibility.json' + self.assertFalse(accessibility_output.exists(), "Should not create file when disabled") + + def test_accessibility_missing_url_argument(self): + """Test that missing --url argument causes error.""" + snapshot_id = 'test-missing-url' + + result = subprocess.run( + ['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'], + cwd=str(self.temp_dir), + capture_output=True, + text=True, + timeout=30, + env=get_test_env() + ) + + # Should fail with non-zero exit code + self.assertNotEqual(result.returncode, 0, "Should fail when URL missing") + + def test_accessibility_missing_snapshot_id_argument(self): + """Test that missing --snapshot-id argument causes error.""" + test_url = 'https://example.com' + + result = subprocess.run( + ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'], + cwd=str(self.temp_dir), + capture_output=True, + text=True, + timeout=30, + 
env=get_test_env() + ) + + # Should fail with non-zero exit code + self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing") + + def test_accessibility_with_no_chrome_session(self): + """Test that hook fails gracefully when no Chrome session exists.""" + test_url = 'https://example.com' + snapshot_id = 'test-no-chrome' + + result = subprocess.run( + ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(self.temp_dir), + capture_output=True, + text=True, + timeout=30, + env=get_test_env() + ) + + # Should fail when no Chrome session + self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists") + # Error should mention CDP or Chrome + err_lower = result.stderr.lower() + self.assertTrue( + any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']), + f"Should mention Chrome/CDP in error: {result.stderr}" + ) + if __name__ == '__main__': pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index dd9ad47b..f61cfcdd 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -1397,11 +1397,11 @@ function getMachineType() { */ function getLibDir() { if (process.env.LIB_DIR) { - return process.env.LIB_DIR; + return path.resolve(process.env.LIB_DIR); } const dataDir = getEnv('DATA_DIR', './data'); const machineType = getMachineType(); - return path.join(dataDir, 'lib', machineType); + return path.resolve(path.join(dataDir, 'lib', machineType)); } /** @@ -1412,9 +1412,9 @@ function getLibDir() { */ function getNodeModulesDir() { if (process.env.NODE_MODULES_DIR) { - return process.env.NODE_MODULES_DIR; + return path.resolve(process.env.NODE_MODULES_DIR); } - return path.join(getLibDir(), 'npm', 'node_modules'); + return path.resolve(path.join(getLibDir(), 'npm', 'node_modules')); } /** diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 89301f5f..3c2424ca 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -37,9 +37,8 @@ Usage: # For Chrome session tests: from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_chrome_session, # Full Chrome + tab setup - cleanup_chrome, # Cleanup by PID - chrome_session, # Context manager + chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup) + cleanup_chrome, # Manual cleanup by PID (rarely needed) ) # For extension tests: @@ -184,8 +183,7 @@ def get_lib_dir() -> Path: # Fallback to Python if os.environ.get('LIB_DIR'): return Path(os.environ['LIB_DIR']) - from archivebox.config.common import STORAGE_CONFIG - return Path(str(STORAGE_CONFIG.LIB_DIR)) + raise Exception('LIB_DIR env var must be set!') def get_node_modules_dir() -> Path: @@ -695,111 +693,6 @@ def chromium_session(env: dict, chrome_dir: Path, crawl_id: str): # ============================================================================= -def setup_chrome_session( - tmpdir: Path, - crawl_id: str = 'test-crawl', - snapshot_id: str = 'test-snapshot', - test_url: str = 'about:blank', - navigate: bool = True, - timeout: int = 15, -) -> Tuple[subprocess.Popen, int, Path]: - """Set up a Chrome session with tab and optional navigation. - - Creates the directory structure, launches Chrome, creates a tab, - and optionally navigates to the test URL. 
- - Args: - tmpdir: Temporary directory for test files - crawl_id: ID to use for the crawl - snapshot_id: ID to use for the snapshot - test_url: URL to navigate to (if navigate=True) - navigate: Whether to navigate to the URL after creating tab - timeout: Seconds to wait for Chrome to start - - Returns: - Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir) - - Raises: - RuntimeError: If Chrome fails to start or tab creation fails - """ - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir(exist_ok=True) - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir(exist_ok=True) - - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' - - # Launch Chrome at crawl level - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chrome to launch - for i in range(timeout): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") - if (chrome_dir / 'cdp_url.txt').exists(): - break - time.sleep(1) - - if not (chrome_dir / 'cdp_url.txt').exists(): - raise RuntimeError(f"Chrome CDP URL not found after {timeout}s") - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - # Create snapshot directory structure - snapshot_dir = Path(tmpdir) / 'snapshot' - snapshot_dir.mkdir(exist_ok=True) - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir(exist_ok=True) - - # Create tab - tab_env = env.copy() - tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - try: - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=tab_env - ) - if result.returncode != 0: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Tab creation failed: {result.stderr}") - except subprocess.TimeoutExpired: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError("Tab creation timed out after 60s") - - # Navigate to URL if requested - if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': - try: - result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - if result.returncode != 0: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Navigation failed: {result.stderr}") - except subprocess.TimeoutExpired: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError("Navigation timed out after 120s") - - return chrome_launch_process, chrome_pid, snapshot_chrome_dir - - def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None: """Clean up Chrome processes using chrome_utils.js killChrome. @@ -835,8 +728,12 @@ def chrome_session( ): """Context manager for Chrome sessions with automatic cleanup. + Creates the directory structure, launches Chrome, creates a tab, + and optionally navigates to the test URL. Automatically cleans up + Chrome on exit. 
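+    Cleanup runs in a finally block, so Chrome is torn down even if the
+    test body raises.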
+ Usage: - with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir): + with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env): # Run tests with chrome session pass # Chrome automatically cleaned up @@ -850,20 +747,129 @@ def chrome_session( timeout: Seconds to wait for Chrome to start Yields: - Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir) + Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env) + + Raises: + RuntimeError: If Chrome fails to start or tab creation fails """ chrome_launch_process = None chrome_pid = None try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - tmpdir=tmpdir, - crawl_id=crawl_id, - snapshot_id=snapshot_id, - test_url=test_url, - navigate=navigate, - timeout=timeout, + # Create proper directory structure in tmpdir + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" + + data_dir = Path(tmpdir) / 'data' + lib_dir = data_dir / 'lib' / machine_type + npm_dir = lib_dir / 'npm' + node_modules_dir = npm_dir / 'node_modules' + + # Create lib structure for puppeteer installation + node_modules_dir.mkdir(parents=True, exist_ok=True) + + # Create crawl and snapshot directories + crawl_dir = Path(tmpdir) / 'crawl' + crawl_dir.mkdir(exist_ok=True) + chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir(exist_ok=True) + + # Build env with tmpdir-specific paths + env = os.environ.copy() + env.update({ + 'DATA_DIR': str(data_dir), + 'LIB_DIR': str(lib_dir), + 'MACHINE_TYPE': machine_type, + 'NODE_MODULES_DIR': str(node_modules_dir), + 'NODE_PATH': str(node_modules_dir), + 'NPM_BIN_DIR': str(npm_dir / '.bin'), + 'CHROME_HEADLESS': 'true', + }) + + # CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium) + # chrome_launch assumes chrome_install has already run + install_result = subprocess.run( + ['python', str(CHROME_INSTALL_HOOK)], + capture_output=True, + text=True, + timeout=120, + env=env ) - yield chrome_launch_process, chrome_pid, snapshot_chrome_dir + if install_result.returncode != 0: + raise RuntimeError(f"Chrome install failed: {install_result.stderr}") + + # Launch Chrome at crawl level + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chrome to launch + for i in range(timeout): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") + if (chrome_dir / 'cdp_url.txt').exists(): + break + time.sleep(1) + + if not (chrome_dir / 'cdp_url.txt').exists(): + raise RuntimeError(f"Chrome CDP URL not found after {timeout}s") + + chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) + + # Create snapshot directory structure + snapshot_dir = Path(tmpdir) / 'snapshot' + snapshot_dir.mkdir(exist_ok=True) + snapshot_chrome_dir = snapshot_dir / 'chrome' + snapshot_chrome_dir.mkdir(exist_ok=True) + + # Create tab + tab_env = env.copy() + tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + try: + result = subprocess.run( + ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], + 
cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=60, + env=tab_env + ) + if result.returncode != 0: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError(f"Tab creation failed: {result.stderr}") + except subprocess.TimeoutExpired: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError("Tab creation timed out after 60s") + + # Navigate to URL if requested + if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': + try: + result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + if result.returncode != 0: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError(f"Navigation failed: {result.stderr}") + except subprocess.TimeoutExpired: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError("Navigation timed out after 120s") + + yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env finally: if chrome_launch_process and chrome_pid: cleanup_chrome(chrome_launch_process, chrome_pid) diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 0702f95f..82672566 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -525,10 +525,9 @@ def test_zombie_prevention_hook_killed(): time.sleep(1) assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist" - assert (chrome_dir / 'hook.pid').exists(), "Hook PID file should exist" chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - hook_pid = int((chrome_dir / 'hook.pid').read_text().strip()) + hook_pid = chrome_launch_process.pid # Use the Popen process PID instead of hook.pid file # Verify both Chrome and hook are running try: diff --git a/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py b/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py new file mode 100644 index 00000000..703ea037 --- /dev/null +++ b/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py @@ -0,0 +1,260 @@ +""" +Tests for chrome_test_helpers.py functions. + +These tests verify the Python helper functions used across Chrome plugin tests. 
+""" + +import os +import pytest +import tempfile +from pathlib import Path + +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_machine_type, + get_lib_dir, + get_node_modules_dir, + get_extensions_dir, + find_chromium_binary, + get_plugin_dir, + get_hook_script, + parse_jsonl_output, +) + + +def test_get_machine_type(): + """Test get_machine_type() returns valid format.""" + machine_type = get_machine_type() + assert isinstance(machine_type, str) + assert '-' in machine_type, "Machine type should be in format: arch-os" + # Should be one of the expected formats + assert any(x in machine_type for x in ['arm64', 'x86_64']), "Should contain valid architecture" + assert any(x in machine_type for x in ['darwin', 'linux', 'win32']), "Should contain valid OS" + + +def test_get_lib_dir_with_env_var(): + """Test get_lib_dir() respects LIB_DIR env var.""" + with tempfile.TemporaryDirectory() as tmpdir: + custom_lib = Path(tmpdir) / 'custom_lib' + custom_lib.mkdir() + + old_lib_dir = os.environ.get('LIB_DIR') + try: + os.environ['LIB_DIR'] = str(custom_lib) + lib_dir = get_lib_dir() + assert lib_dir == custom_lib + finally: + if old_lib_dir: + os.environ['LIB_DIR'] = old_lib_dir + else: + os.environ.pop('LIB_DIR', None) + + +def test_get_node_modules_dir_with_env_var(): + """Test get_node_modules_dir() respects NODE_MODULES_DIR env var.""" + with tempfile.TemporaryDirectory() as tmpdir: + custom_nm = Path(tmpdir) / 'node_modules' + custom_nm.mkdir() + + old_nm_dir = os.environ.get('NODE_MODULES_DIR') + try: + os.environ['NODE_MODULES_DIR'] = str(custom_nm) + nm_dir = get_node_modules_dir() + assert nm_dir == custom_nm + finally: + if old_nm_dir: + os.environ['NODE_MODULES_DIR'] = old_nm_dir + else: + os.environ.pop('NODE_MODULES_DIR', None) + + +def test_get_extensions_dir_default(): + """Test get_extensions_dir() returns expected path format.""" + ext_dir = get_extensions_dir() + assert isinstance(ext_dir, str) + assert 'personas' in ext_dir + assert 'chrome_extensions' in ext_dir + + +def test_get_extensions_dir_with_custom_persona(): + """Test get_extensions_dir() respects ACTIVE_PERSONA env var.""" + old_persona = os.environ.get('ACTIVE_PERSONA') + old_data_dir = os.environ.get('DATA_DIR') + try: + os.environ['ACTIVE_PERSONA'] = 'TestPersona' + os.environ['DATA_DIR'] = '/tmp/test' + ext_dir = get_extensions_dir() + assert 'TestPersona' in ext_dir + assert '/tmp/test' in ext_dir + finally: + if old_persona: + os.environ['ACTIVE_PERSONA'] = old_persona + else: + os.environ.pop('ACTIVE_PERSONA', None) + if old_data_dir: + os.environ['DATA_DIR'] = old_data_dir + else: + os.environ.pop('DATA_DIR', None) + + +def test_get_test_env_returns_dict(): + """Test get_test_env() returns properly formatted environment dict.""" + env = get_test_env() + assert isinstance(env, dict) + + # Should include key paths + assert 'MACHINE_TYPE' in env + assert 'LIB_DIR' in env + assert 'NODE_MODULES_DIR' in env + assert 'NODE_PATH' in env # Critical for module resolution + assert 'NPM_BIN_DIR' in env + assert 'CHROME_EXTENSIONS_DIR' in env + + # Verify NODE_PATH equals NODE_MODULES_DIR (for Node.js module resolution) + assert env['NODE_PATH'] == env['NODE_MODULES_DIR'] + + +def test_get_test_env_paths_are_absolute(): + """Test that get_test_env() returns absolute paths.""" + env = get_test_env() + + # All path-like values should be absolute + assert Path(env['LIB_DIR']).is_absolute() + assert Path(env['NODE_MODULES_DIR']).is_absolute() + assert Path(env['NODE_PATH']).is_absolute() + 
+ +def test_find_chromium_binary(): + """Test find_chromium_binary() returns a path or None.""" + binary = find_chromium_binary() + if binary: + assert isinstance(binary, str) + # Should be an absolute path if found + assert os.path.isabs(binary) + + +def test_get_plugin_dir(): + """Test get_plugin_dir() finds correct plugin directory.""" + # Use this test file's path + test_file = __file__ + plugin_dir = get_plugin_dir(test_file) + + assert plugin_dir.exists() + assert plugin_dir.is_dir() + # Should be the chrome plugin directory + assert plugin_dir.name == 'chrome' + assert (plugin_dir.parent.name == 'plugins') + + +def test_get_hook_script_finds_existing_hook(): + """Test get_hook_script() can find an existing hook.""" + from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR + + # Try to find the chrome launch hook + hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*') + + if hook: # May not exist in all test environments + assert hook.exists() + assert hook.is_file() + assert 'chrome_launch' in hook.name + + +def test_get_hook_script_returns_none_for_missing(): + """Test get_hook_script() returns None for non-existent hooks.""" + from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR + + hook = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*') + assert hook is None + + +def test_parse_jsonl_output_valid(): + """Test parse_jsonl_output() parses valid JSONL.""" + jsonl_output = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"} +{"type": "ArchiveResult", "status": "failed", "error": "test2"} +''' + + # Returns first match only + result = parse_jsonl_output(jsonl_output) + assert result is not None + assert result['type'] == 'ArchiveResult' + assert result['status'] == 'succeeded' + assert result['output'] == 'test1' + + +def test_parse_jsonl_output_with_non_json_lines(): + """Test parse_jsonl_output() skips non-JSON lines.""" + mixed_output = '''Some non-JSON output +{"type": "ArchiveResult", "status": "succeeded"} +More non-JSON +{"type": "ArchiveResult", "status": "failed"} +''' + + result = parse_jsonl_output(mixed_output) + assert result is not None + assert result['type'] == 'ArchiveResult' + assert result['status'] == 'succeeded' + + +def test_parse_jsonl_output_empty(): + """Test parse_jsonl_output() handles empty input.""" + result = parse_jsonl_output('') + assert result is None + + +def test_parse_jsonl_output_filters_by_type(): + """Test parse_jsonl_output() can filter by record type.""" + jsonl_output = '''{"type": "LogEntry", "data": "log1"} +{"type": "ArchiveResult", "data": "result1"} +{"type": "ArchiveResult", "data": "result2"} +''' + + # Should return first ArchiveResult, not LogEntry + result = parse_jsonl_output(jsonl_output, record_type='ArchiveResult') + assert result is not None + assert result['type'] == 'ArchiveResult' + assert result['data'] == 'result1' # First ArchiveResult + + +def test_parse_jsonl_output_filters_custom_type(): + """Test parse_jsonl_output() can filter by custom record type.""" + jsonl_output = '''{"type": "ArchiveResult", "data": "result1"} +{"type": "LogEntry", "data": "log1"} +{"type": "ArchiveResult", "data": "result2"} +''' + + result = parse_jsonl_output(jsonl_output, record_type='LogEntry') + assert result is not None + assert result['type'] == 'LogEntry' + assert result['data'] == 'log1' + + +def test_machine_type_consistency(): + """Test that machine type is consistent across calls.""" + mt1 = get_machine_type() + mt2 = 
get_machine_type() + assert mt1 == mt2, "Machine type should be stable across calls" + + +def test_lib_dir_is_directory(): + """Test that lib_dir points to an actual directory when DATA_DIR is set.""" + with tempfile.TemporaryDirectory() as tmpdir: + old_data_dir = os.environ.get('DATA_DIR') + try: + os.environ['DATA_DIR'] = tmpdir + # Create the expected directory structure + machine_type = get_machine_type() + lib_dir = Path(tmpdir) / 'lib' / machine_type + lib_dir.mkdir(parents=True, exist_ok=True) + + result = get_lib_dir() + # Should return a Path object + assert isinstance(result, Path) + finally: + if old_data_dir: + os.environ['DATA_DIR'] = old_data_dir + else: + os.environ.pop('DATA_DIR', None) + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py index 741776f0..2f9189ff 100644 --- a/archivebox/plugins/consolelog/tests/test_consolelog.py +++ b/archivebox/plugins/consolelog/tests/test_consolelog.py @@ -72,10 +72,9 @@ class TestConsolelogWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the consolelog hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run consolelog hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py new file mode 100755 index 00000000..f52a72f2 --- /dev/null +++ b/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect forum-dl binary and emit Binary JSONL record. 
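+If forum-dl is not found, a Binary record with binproviders='pip' is emitted
+instead so the binary can be installed later.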
+ +Output: Binary JSONL record to stdout if forum-dl is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True) + forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl') + + if not forumdl_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=forumdl_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='forum-dl') + else: + # Binary not found + output_binary_missing(name='forum-dl', binproviders='pip') + except Exception: + # Binary not found + output_binary_missing(name='forum-dl', binproviders='pip') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py index eee44ce4..1248518a 100644 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -24,8 +24,7 @@ import pytest # Import shared Chrome test helpers from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - setup_chrome_session, - cleanup_chrome, + chrome_session, ) @@ -101,22 +100,17 @@ def test_fails_gracefully_without_chrome_session(): def test_scrolls_page_and_outputs_stats(): """Integration test: scroll page and verify JSONL output format.""" with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None - try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - Path(tmpdir), - crawl_id='test-infiniscroll', - snapshot_id='snap-infiniscroll', - test_url=TEST_URL, - ) - + with chrome_session( + Path(tmpdir), + crawl_id='test-infiniscroll', + snapshot_id='snap-infiniscroll', + test_url=TEST_URL, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): # Create infiniscroll output directory (sibling to chrome) infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' infiniscroll_dir.mkdir() # Run infiniscroll hook - env = get_test_env() env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test @@ -158,29 +152,21 @@ def test_scrolls_page_and_outputs_stats(): output_files 
= list(infiniscroll_dir.iterdir()) assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" - finally: - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) - def test_config_scroll_limit_honored(): """Test that INFINISCROLL_SCROLL_LIMIT config is respected.""" with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None - try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - Path(tmpdir), - crawl_id='test-scroll-limit', - snapshot_id='snap-limit', - test_url=TEST_URL, - ) + with chrome_session( + Path(tmpdir), + crawl_id='test-scroll-limit', + snapshot_id='snap-limit', + test_url=TEST_URL, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' infiniscroll_dir.mkdir() - # Set scroll limit to 2 - env = get_test_env() + # Set scroll limit to 2 (use env from setup_chrome_session) env['INFINISCROLL_SCROLL_LIMIT'] = '2' env['INFINISCROLL_SCROLL_DELAY'] = '500' env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in @@ -215,29 +201,22 @@ def test_config_scroll_limit_honored(): assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}" assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}" - finally: - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) def test_config_timeout_honored(): """Test that INFINISCROLL_TIMEOUT config is respected.""" with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None - try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - Path(tmpdir), - crawl_id='test-timeout', - snapshot_id='snap-timeout', - test_url=TEST_URL, - ) + with chrome_session( + Path(tmpdir), + crawl_id='test-timeout', + snapshot_id='snap-timeout', + test_url=TEST_URL, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' infiniscroll_dir.mkdir() - # Set very short timeout - env = get_test_env() + # Set very short timeout (use env from setup_chrome_session) env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit @@ -258,9 +237,6 @@ def test_config_timeout_honored(): assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s" assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}" - finally: - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) if __name__ == '__main__': diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index a9525c89..7fdc1c4a 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -154,8 +154,7 @@ def test_extension_loads_in_chromium(): # Step 1: Install the extension result = subprocess.run( ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir, - env=get_test_env()), + cwd=str(tmpdir), capture_output=True, text=True, env=env, diff --git a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py 
b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py index 57fe5e7e..25d1c9c1 100755 --- a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py +++ b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Detect mercury-parser binary and emit Binary JSONL record. +Detect postlight-parser binary and emit Binary JSONL record. -Output: Binary JSONL record to stdout if mercury-parser is found +Output: Binary JSONL record to stdout if postlight-parser is found """ import json @@ -48,6 +48,11 @@ def output_binary_missing(name: str, binproviders: str): 'type': 'Binary', 'name': name, 'binproviders': binproviders, # Providers that can install it + 'overrides': { + 'npm': { + 'packages': ['@postlight/parser'], + } + }, 'machine_id': machine_id, } print(json.dumps(record)) @@ -55,7 +60,7 @@ def output_binary_missing(name: str, binproviders: str): def main(): mercury_enabled = get_env_bool('MERCURY_ENABLED', True) - mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser') + mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser') if not mercury_enabled: sys.exit(0) @@ -65,13 +70,13 @@ def main(): binary = Binary(name=mercury_binary, binproviders=[provider]).load() if binary.abspath: # Binary found - output_binary_found(binary, name='mercury-parser') + output_binary_found(binary, name='postlight-parser') else: # Binary not found - output_binary_missing(name='mercury-parser', binproviders='npm') + output_binary_missing(name='postlight-parser', binproviders='npm') except Exception: # Binary not found - output_binary_missing(name='mercury-parser', binproviders='npm') + output_binary_missing(name='postlight-parser', binproviders='npm') sys.exit(0) diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py index 1039d99c..b66d20d2 100644 --- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py +++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py @@ -25,8 +25,7 @@ import pytest # Import shared Chrome test helpers from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - setup_chrome_session, - cleanup_chrome, + chrome_session, ) @@ -103,129 +102,119 @@ def test_fails_gracefully_without_chrome_session(): def test_background_script_handles_sigterm(): """Test that background script runs and handles SIGTERM correctly.""" with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None modalcloser_process = None try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( + with chrome_session( Path(tmpdir), crawl_id='test-modalcloser', snapshot_id='snap-modalcloser', test_url=TEST_URL, - ) + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): + # Create modalcloser output directory (sibling to chrome) + modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' + modalcloser_dir.mkdir() - # Create modalcloser output directory (sibling to chrome) - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() + # Run modalcloser as background process (use env from setup_chrome_session) + env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test - # Run modalcloser as background process - env = get_test_env() - env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test + modalcloser_process = subprocess.Popen( + ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'], + 
cwd=str(modalcloser_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + # Let it run for a bit + time.sleep(2) - # Let it run for a bit - time.sleep(2) + # Verify it's still running (background script) + assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process" - # Verify it's still running (background script) - assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process" + # Send SIGTERM + modalcloser_process.send_signal(signal.SIGTERM) + stdout, stderr = modalcloser_process.communicate(timeout=5) - # Send SIGTERM - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = modalcloser_process.communicate(timeout=5) + assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}" - assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}" + # Parse JSONL output + result_json = None + for line in stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass - # Parse JSONL output - result_json = None - for line in stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass + assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - assert result_json is not None, f"Should have ArchiveResult JSONL output. 
Stdout: {stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + # Verify output_str format + output_str = result_json.get('output_str', '') + assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \ + f"output_str should mention modals/dialogs: {output_str}" - # Verify output_str format - output_str = result_json.get('output_str', '') - assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \ - f"output_str should mention modals/dialogs: {output_str}" - - # Verify no files created in output directory - output_files = list(modalcloser_dir.iterdir()) - assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" + # Verify no files created in output directory + output_files = list(modalcloser_dir.iterdir()) + assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" finally: if modalcloser_process and modalcloser_process.poll() is None: modalcloser_process.kill() - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) def test_dialog_handler_logs_dialogs(): """Test that dialog handler is set up correctly.""" with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None modalcloser_process = None try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - Path(tmpdir), - crawl_id='test-dialog', - snapshot_id='snap-dialog', - test_url=TEST_URL, - ) + with chrome_session( + Path(tmpdir), + crawl_id='test-dialog', + snapshot_id='snap-dialog', + test_url=TEST_URL, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() + modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' + modalcloser_dir.mkdir() - env = get_test_env() - env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test - env['MODALCLOSER_POLL_INTERVAL'] = '200' + # Use env from setup_chrome_session + env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test + env['MODALCLOSER_POLL_INTERVAL'] = '200' - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + modalcloser_process = subprocess.Popen( + ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'], + cwd=str(modalcloser_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # Let it run briefly - time.sleep(1.5) + # Let it run briefly + time.sleep(1.5) - # Verify it's running - assert modalcloser_process.poll() is None, "Should be running" + # Verify it's running + assert modalcloser_process.poll() is None, "Should be running" - # Check stderr for "listening" message - # Note: Can't read stderr while process is running without blocking, - # so we just verify it exits cleanly - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = modalcloser_process.communicate(timeout=5) + # Check stderr for "listening" message + # Note: Can't read stderr while process is running without blocking, + # so we just verify it exits cleanly + modalcloser_process.send_signal(signal.SIGTERM) + stdout, stderr = modalcloser_process.communicate(timeout=5) - assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \ - f"Should log startup message: {stderr}" - assert modalcloser_process.returncode == 0, 
f"Should exit cleanly: {stderr}" + assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \ + f"Should log startup message: {stderr}" + assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}" finally: if modalcloser_process and modalcloser_process.poll() is None: modalcloser_process.kill() - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) def test_config_poll_interval(): @@ -235,61 +224,58 @@ def test_config_poll_interval(): chrome_pid = None modalcloser_process = None try: - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - Path(tmpdir), - crawl_id='test-poll', - snapshot_id='snap-poll', - test_url=TEST_URL, - ) + with chrome_session( + Path(tmpdir), + crawl_id='test-poll', + snapshot_id='snap-poll', + test_url=TEST_URL, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() + modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' + modalcloser_dir.mkdir() - # Set very short poll interval - env = get_test_env() - env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms + # Set very short poll interval (use env from setup_chrome_session) + env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) + modalcloser_process = subprocess.Popen( + ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'], + cwd=str(modalcloser_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # Run for short time - time.sleep(1) + # Run for short time + time.sleep(1) - # Should still be running - assert modalcloser_process.poll() is None, "Should still be running" + # Should still be running + assert modalcloser_process.poll() is None, "Should still be running" - # Clean exit - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = modalcloser_process.communicate(timeout=5) + # Clean exit + modalcloser_process.send_signal(signal.SIGTERM) + stdout, stderr = modalcloser_process.communicate(timeout=5) - assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}" + assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}" - # Verify JSONL output exists - result_json = None - for line in stdout.strip().split('\n'): - if line.strip().startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass + # Verify JSONL output exists + result_json = None + for line in stdout.strip().split('\n'): + if line.strip().startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass - assert result_json is not None, "Should have JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + assert result_json is not None, "Should have JSONL output" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" finally: if modalcloser_process and modalcloser_process.poll() is None: modalcloser_process.kill() - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) def test_hides_cookie_consent_on_filmin(): diff --git 
a/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py b/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py new file mode 100755 index 00000000..8c548c7c --- /dev/null +++ b/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect papers-dl binary and emit Binary JSONL record. + +Output: Binary JSONL record to stdout if papers-dl is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True) + papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl') + + if not papersdl_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=papersdl_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='papers-dl') + else: + # Binary not found + output_binary_missing(name='papers-dl', binproviders='pip') + except Exception: + # Binary not found + output_binary_missing(name='papers-dl', binproviders='pip') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py index d87df28d..33045184 100644 --- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py +++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py @@ -72,10 +72,9 @@ class TestParseDomOutlinksWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the outlinks hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run outlinks hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py index 934b14c7..0164d461 100644 --- a/archivebox/plugins/redirects/tests/test_redirects.py +++ b/archivebox/plugins/redirects/tests/test_redirects.py @@ -73,10 +73,9 @@ class TestRedirectsWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get 
environment and run the redirects hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run redirects hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py index ea710c83..c66f7652 100644 --- a/archivebox/plugins/responses/tests/test_responses.py +++ b/archivebox/plugins/responses/tests/test_responses.py @@ -72,10 +72,9 @@ class TestResponsesWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the responses hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run responses hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index 7bcb7951..52c49b59 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -1,20 +1,15 @@ #!/usr/bin/env node /** - * Take a screenshot of a URL using Chrome/Puppeteer. + * Take a screenshot of a URL using an existing Chrome session. * - * If a Chrome session exists (from chrome plugin), connects to it via CDP. - * Otherwise launches a new Chrome instance. + * Requires chrome plugin to have already created a Chrome session. + * Connects to the existing session via CDP and takes a screenshot. 
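+ * Reads the shared Chrome session files from CHROME_SESSION_DIR (session layout
+ * assumed from the chrome plugin hooks and the tests below):
+ *   cdp_url.txt       CDP websocket endpoint (written by chrome_launch)
+ *   target_id.txt     target id of this snapshot's tab (written by chrome_tab)
+ *   navigation.json   marker written by chrome_navigate once the page has loaded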
* * Usage: on_Snapshot__51_screenshot.js --url= --snapshot-id= * Output: Writes screenshot/screenshot.png * * Environment variables: - * CHROME_BINARY: Path to Chrome/Chromium binary - * CHROME_TIMEOUT: Timeout in seconds (default: 60) * CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000) - * CHROME_USER_AGENT: User agent string (optional) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_HEADLESS: Run in headless mode (default: true) * SCREENSHOT_ENABLED: Enable screenshot capture (default: true) */ @@ -24,10 +19,8 @@ const path = require('path'); if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const { - findChromium, getEnv, getEnvBool, - getEnvInt, parseResolution, parseArgs, readCdpUrl, @@ -56,7 +49,7 @@ function hasStaticFileOutput() { } // Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 60000) { +async function waitForChromeTabLoaded(timeoutMs = 10000) { const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); const startTime = Date.now(); @@ -72,102 +65,66 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) { } async function takeScreenshot(url) { - const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); - const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - const headless = getEnvBool('CHROME_HEADLESS', true); - + const resolution = getEnv('CHROME_RESOLUTION', '1440,2000'); const { width, height } = parseResolution(resolution); // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - let browser = null; - let page = null; - let connectedToSession = false; + // Wait for chrome_navigate to complete (writes navigation.json) + const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10); + const timeoutMs = timeoutSeconds * 1000; + const pageLoaded = await waitForChromeTabLoaded(timeoutMs); + if (!pageLoaded) { + throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`); + } + + // Connect to existing Chrome session (required - no fallback) + const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); + if (!cdpUrl) { + throw new Error('No Chrome session found (chrome plugin must run first)'); + } + + // Read target_id.txt to get the specific tab for this snapshot + const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); + if (!fs.existsSync(targetIdFile)) { + throw new Error('No target_id.txt found (chrome_tab must run first)'); + } + const targetId = fs.readFileSync(targetIdFile, 'utf8').trim(); + + const browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: { width, height }, + }); try { - // Try to connect to existing Chrome session - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - try { - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: { width, height }, - }); - connectedToSession = true; - - // Get existing pages or create new one - const pages = await browser.pages(); - page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - page = await browser.newPage(); - } - - // Set viewport on the page - await page.setViewport({ width, height }); - - } catch (e) { - 
console.error(`Failed to connect to CDP session: ${e.message}`); - browser = null; - } + // Get the specific page for this snapshot by target ID + const targets = await browser.targets(); + const target = targets.find(t => t._targetId === targetId); + if (!target) { + throw new Error(`Target ${targetId} not found in Chrome session`); } - // Fall back to launching new browser - if (!browser) { - const executablePath = findChromium(); - if (!executablePath) { - return { success: false, error: 'Chrome binary not found' }; - } - - browser = await puppeteer.launch({ - executablePath, - headless: headless ? 'new' : false, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - `--window-size=${width},${height}`, - ...(checkSsl ? [] : ['--ignore-certificate-errors']), - ], - defaultViewport: { width, height }, - }); - - page = await browser.newPage(); - - // Navigate to URL (only if we launched fresh browser) - if (userAgent) { - await page.setUserAgent(userAgent); - } - - await page.goto(url, { - waitUntil: 'networkidle2', - timeout, - }); + const page = await target.page(); + if (!page) { + throw new Error(`Could not get page for target ${targetId}`); } - // Take screenshot + // Set viewport on the page + await page.setViewport({ width, height }); + + // Take screenshot (Puppeteer throws on failure) await page.screenshot({ path: outputPath, fullPage: true, }); - if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) { - return { success: true, output: outputPath }; - } else { - return { success: false, error: 'Screenshot file not created' }; - } + return outputPath; - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; } finally { - // Only close browser if we launched it (not if we connected to session) - if (browser && !connectedToSession) { - await browser.close(); - } + // Disconnect from browser (don't close it - we're connected to a shared session) + // The chrome_launch hook manages the browser lifecycle + await browser.disconnect(); } } @@ -181,54 +138,33 @@ async function main() { process.exit(1); } - try { - // Check if staticfile extractor already handled this (permanent skip) - if (hasStaticFileOutput()) { - console.error(`Skipping screenshot - staticfile extractor already downloaded this`); - // Permanent skip - emit ArchiveResult - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'skipped', - output_str: 'staticfile already handled', - })); - process.exit(0); - } - - // Only wait for page load if using shared Chrome session - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (cdpUrl) { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } - } - - const result = await takeScreenshot(url); - - if (result.success) { - // Success - emit ArchiveResult - const size = fs.statSync(result.output).size; - console.error(`Screenshot saved (${size} bytes)`); - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: result.output, - })); - process.exit(0); - } else { - // Transient error - emit NO JSONL - console.error(`ERROR: ${result.error}`); - process.exit(1); - } - } catch (e) { - // Transient error - emit NO JSONL - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); + // Check if staticfile extractor already handled this (permanent skip) + if (hasStaticFileOutput()) { + 
console.error(`Skipping screenshot - staticfile extractor already downloaded this`); + // Permanent skip - emit ArchiveResult + console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'skipped', + output_str: 'staticfile already handled', + })); + process.exit(0); } + + // Take screenshot (throws on error) + const outputPath = await takeScreenshot(url); + + // Success - emit ArchiveResult + const size = fs.statSync(outputPath).size; + console.error(`Screenshot saved (${size} bytes)`); + console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: outputPath, + })); } main().catch(e => { - console.error(`Fatal error: ${e.message}`); + // Transient error - emit NO JSONL + console.error(`ERROR: ${e.message}`); process.exit(1); }); diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index 04c89f7e..a522f38b 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -25,6 +25,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_plugin_dir, get_hook_script, run_hook_and_parse, + chrome_session, LIB_DIR, NODE_MODULES_DIR, CHROME_PLUGIN_DIR, @@ -62,192 +63,96 @@ def test_verify_deps_with_abx_pkg(): assert node_loaded and node_loaded.abspath, "Node.js required for screenshot plugin" -def test_extracts_screenshot_from_example_com(): - """Test full workflow: extract screenshot from real example.com via hook. - - Replicates production directory structure: - DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/ - DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/ - DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/ - - This exercises the "connect to existing session" code path which is the primary - path in production and accounts for ~50% of the code. 
- """ - import signal - import time - import os - +def test_screenshot_with_chrome_session(): + """Test multiple screenshot scenarios with one Chrome session to save time.""" with tempfile.TemporaryDirectory() as tmpdir: - # Replicate exact production directory structure - data_dir = Path(tmpdir) - crawl_id = 'test-screenshot-crawl' + test_url = 'https://example.com' snapshot_id = 'test-screenshot-snap' - # Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/ - crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir(parents=True) - - # Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/ - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir(parents=True) - - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir() - - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' - - # Step 1: Launch Chrome session at crawl level (background process) - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chrome to launch - for i in range(15): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") - if (chrome_dir / 'cdp_url.txt').exists(): - break - time.sleep(1) - - assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist" - assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist" - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - try: - # Step 2: Create tab at snapshot level - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - assert result.returncode == 0, f"Tab creation failed: {result.stderr}" - assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist" + with chrome_session( + Path(tmpdir), + crawl_id='test-screenshot-crawl', + snapshot_id=snapshot_id, + test_url=test_url, + navigate=True, + timeout=30, + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Step 3: Navigate to URL - result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - assert result.returncode == 0, f"Navigation failed: {result.stderr}" - assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist" + # Scenario 1: Basic screenshot extraction + screenshot_dir = snapshot_chrome_dir.parent / 'screenshot' + screenshot_dir.mkdir() - # Step 4: Take screenshot (should connect to existing session) - # Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), 
f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) - assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}" + assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}" - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass + # Parse JSONL output + result_json = None + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}" + assert result_json and result_json['status'] == 'succeeded' + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000 + assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n' - # Verify filesystem output - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}" + # Scenario 2: Custom resolution + screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2' + screenshot_dir2.mkdir() + env['CHROME_RESOLUTION'] = '800,600' - # Verify file is valid PNG - file_size = screenshot_file.stat().st_size - assert file_size > 1000, f"Screenshot too small: {file_size} bytes" - assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes" + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(screenshot_dir2), + capture_output=True, + text=True, + timeout=30, + env=env + ) - # Check PNG magic bytes - screenshot_data = screenshot_file.read_bytes() - assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file" + assert result.returncode == 0 + screenshot_file2 = screenshot_dir2 / 'screenshot.png' + assert screenshot_file2.exists() + file_size = screenshot_file2.stat().st_size + assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}" - finally: - # Cleanup: Kill Chrome - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass + # Scenario 3: Wrong target ID (error case) + screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3' + screenshot_dir3.mkdir() + (snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id') + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(screenshot_dir3), + capture_output=True, + text=True, + timeout=5, + env=env + ) -def test_extracts_screenshot_without_session(): - """Test screenshot extraction without existing Chrome session (fallback to own browser).""" - with tempfile.TemporaryDirectory() as tmpdir: - # Create proper snapshot directory structure - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 
'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) + assert result.returncode != 0 + assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower() - # Don't set up Chrome session or staticfile - screenshot should launch its own browser - env = get_test_env() - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - assert 'screenshot.png' in result_json['output_str'] - - # Verify file created - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), "screenshot.png not created" - assert screenshot_file.stat().st_size > 1000, "Screenshot too small" + except RuntimeError as e: + if 'Chrome' in str(e) or 'CDP' in str(e): + pytest.skip(f"Chrome session setup failed: {e}") + raise def test_skips_when_staticfile_exists(): @@ -344,57 +249,42 @@ def test_reports_missing_chrome(): assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined -def test_custom_resolution_and_user_agent(): - """Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected.""" +def test_waits_for_navigation_timeout(): + """Test that screenshot waits for navigation.json and times out quickly if missing.""" + import time + with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) + tmpdir = Path(tmpdir) + + # Create chrome directory without navigation.json to trigger timeout + chrome_dir = tmpdir.parent / 'chrome' + chrome_dir.mkdir(parents=True, exist_ok=True) + (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test') + (chrome_dir / 'target_id.txt').write_text('test-target-id') + # Intentionally NOT creating navigation.json to test timeout + + screenshot_dir = tmpdir / 'screenshot' + screenshot_dir.mkdir() env = get_test_env() - env['CHROME_RESOLUTION'] = '800,600' - env['CHROME_USER_AGENT'] = 'Test/1.0' + env['SCREENSHOT_TIMEOUT'] = '2' # Set 2 second timeout + start_time = time.time() result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'], + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-timeout'], cwd=str(screenshot_dir), capture_output=True, text=True, - timeout=120, + timeout=5, # Test timeout slightly higher than SCREENSHOT_TIMEOUT env=env ) + elapsed = time.time() - start_time - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), "screenshot.png not created" - # Resolution affects file size - assert screenshot_file.stat().st_size > 500, "Screenshot 
too small" - - -def test_ssl_check_disabled(): - """Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - env['CHROME_CHECK_SSL_VALIDITY'] = 'False' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Should succeed: {result.stderr}" - assert (screenshot_dir / 'screenshot.png').exists() + # Should fail when navigation.json doesn't appear + assert result.returncode != 0, "Should fail when navigation.json missing" + assert 'not loaded' in result.stderr.lower() or 'navigate' in result.stderr.lower(), f"Should mention navigation timeout: {result.stderr}" + # Should complete within 3s (2s wait + 1s overhead) + assert elapsed < 3, f"Should timeout within 3s, took {elapsed:.1f}s" def test_config_timeout_honored(): @@ -485,345 +375,114 @@ def test_invalid_resolution_format(): # (depending on implementation - script should not crash with uncaught error) assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}" - -def test_boolean_env_var_parsing(): - """Test that boolean environment variables are parsed correctly.""" - import time +def test_no_cdp_url_fails(): + """Test error when chrome dir exists but no cdp_url.txt.""" with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - - # Test various boolean formats for CHROME_HEADLESS - for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']: - env['CHROME_HEADLESS'] = bool_val - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - # Should either succeed or fail, but shouldn't crash on boolean parsing - assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}" - - # Clean up screenshot file if created - screenshot_file = screenshot_dir / 'screenshot.png' - if screenshot_file.exists(): - screenshot_file.unlink() - - time.sleep(0.5) # Brief pause between attempts - - -def test_integer_env_var_parsing(): - """Test that integer environment variables are parsed correctly.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - - # Test valid and invalid integer formats for CHROME_TIMEOUT - test_cases = [ - ('60', True), # Valid integer - ('invalid', True), # Invalid - should use default - ('', True), # Empty - should use default - ] - - for timeout_val, should_work in test_cases: - env['CHROME_TIMEOUT'] = timeout_val - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - 
env=env - ) - # Should either succeed or fail gracefully, but shouldn't crash on int parsing - assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}" - - # Clean up screenshot file if created - screenshot_file = screenshot_dir / 'screenshot.png' - if screenshot_file.exists(): - screenshot_file.unlink() - - -def test_extracts_screenshot_with_all_config_options(): - """Test screenshot with comprehensive config to exercise all code paths.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - # Set ALL config options to exercise all code paths - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' - env['CHROME_RESOLUTION'] = '800,600' - env['CHROME_USER_AGENT'] = 'TestBot/1.0' - env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch - env['CHROME_TIMEOUT'] = '60' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}" - - # Verify JSONL output with success - result_json = None - for line in result.stdout.strip().split('\n'): - if line.strip().startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - assert 'screenshot.png' in result_json['output_str'] - - # Verify file created - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), "screenshot.png should be created" - assert screenshot_file.stat().st_size > 1000, "Screenshot should have content" - - -def test_headless_mode_false(): - """Test headless=false code path specifically.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - # Explicitly test headless=false (exercises the ternary false branch) - env['CHROME_HEADLESS'] = 'false' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - # Should work or fail gracefully - assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}" - - -def test_invalid_url_causes_error(): - """Test error path with invalid URL that causes navigation failure.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - env['CHROME_TIMEOUT'] = '5' # Short timeout - - # Use invalid URL to trigger error path - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'], - cwd=str(screenshot_dir), - 
capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should fail due to navigation error - assert result.returncode != 0, "Should fail on invalid URL" - # Should NOT emit JSONL (transient error) - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}" - - -def test_with_corrupted_cdp_url_falls_back(): - """Test that corrupted CDP URL file causes fallback to launching browser.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - # Create chrome directory with corrupted CDP URL - chrome_dir = snapshot_dir / 'chrome' + tmpdir = Path(tmpdir) + chrome_dir = tmpdir / 'chrome' chrome_dir.mkdir() - (chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid') + # Create target_id.txt and navigation.json but NOT cdp_url.txt + (chrome_dir / 'target_id.txt').write_text('test-target') + (chrome_dir / 'navigation.json').write_text('{}') - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' - env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test + screenshot_dir = tmpdir / 'screenshot' + screenshot_dir.mkdir() - # Screenshot should try CDP, fail quickly, then fall back to launching own browser result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'], + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'], cwd=str(screenshot_dir), capture_output=True, text=True, - timeout=30, - env=env + timeout=7, + env=get_test_env() ) - # Should succeed by falling back to launching browser - assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}" - assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure" - - # Verify screenshot was created via fallback path - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), "Screenshot should be created via fallback" + assert result.returncode != 0 + assert 'no chrome session' in result.stderr.lower() -def test_user_agent_is_applied(): - """Test that CHROME_USER_AGENT is actually applied when launching browser.""" +def test_no_target_id_fails(): + """Test error when cdp_url exists but no target_id.txt.""" with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) + tmpdir = Path(tmpdir) + chrome_dir = tmpdir / 'chrome' + chrome_dir.mkdir() + # Create cdp_url.txt and navigation.json but NOT target_id.txt + (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test') + (chrome_dir / 'navigation.json').write_text('{}') - env = get_test_env() - env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)' - env['CHROME_HEADLESS'] = 'true' + screenshot_dir = tmpdir / 'screenshot' + screenshot_dir.mkdir() result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'], + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'], cwd=str(screenshot_dir), capture_output=True, text=True, - timeout=120, - env=env + timeout=7, + env=get_test_env() ) - # Should succeed 
with custom user agent - assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}" - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists(), "Screenshot should be created" + assert result.returncode != 0 + assert 'target_id.txt' in result.stderr.lower() -def test_check_ssl_false_branch(): - """Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg.""" +def test_invalid_cdp_url_fails(): + """Test error with malformed CDP URL.""" with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) + tmpdir = Path(tmpdir) + chrome_dir = tmpdir / 'chrome' + chrome_dir.mkdir() + (chrome_dir / 'cdp_url.txt').write_text('invalid-url') + (chrome_dir / 'target_id.txt').write_text('test-target') + (chrome_dir / 'navigation.json').write_text('{}') - env = get_test_env() - env['CHROME_CHECK_SSL_VALIDITY'] = 'false' - env['CHROME_HEADLESS'] = 'true' + screenshot_dir = tmpdir / 'screenshot' + screenshot_dir.mkdir() - # Test with both boolean false and string 'false' result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'], + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'], cwd=str(screenshot_dir), capture_output=True, text=True, - timeout=120, - env=env + timeout=7, + env=get_test_env() ) - assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}" - assert (screenshot_dir / 'screenshot.png').exists() + assert result.returncode != 0 -def test_alternative_env_var_names(): - """Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc).""" +def test_invalid_timeout_uses_default(): + """Test that invalid SCREENSHOT_TIMEOUT falls back to default.""" with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) + tmpdir = Path(tmpdir) + chrome_dir = tmpdir / 'chrome' + chrome_dir.mkdir() + # No navigation.json to trigger timeout + (chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/test') + (chrome_dir / 'target_id.txt').write_text('test') + + screenshot_dir = tmpdir / 'screenshot' + screenshot_dir.mkdir() env = get_test_env() - # Use alternative env var names (without CHROME_ prefix) - env['TIMEOUT'] = '45' - env['RESOLUTION'] = '1024,768' - env['USER_AGENT'] = 'AltBot/1.0' - env['CHECK_SSL_VALIDITY'] = 'false' + env['SCREENSHOT_TIMEOUT'] = 'invalid' # Should fallback to default (10s becomes NaN, treated as 0) + import time + start = time.time() result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'], + ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'], cwd=str(screenshot_dir), capture_output=True, text=True, - timeout=120, + timeout=5, env=env ) + elapsed = time.time() - start - assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}" - assert (screenshot_dir / 'screenshot.png').exists() - - -def test_very_large_resolution(): - """Test screenshot with very large resolution.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 
'snapshots' / '20240101' / 'example.com' / 'snap-large' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution - env['CHROME_HEADLESS'] = 'true' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Should handle large resolution: {result.stderr}" - screenshot_file = screenshot_dir / 'screenshot.png' - assert screenshot_file.exists() - # 4K screenshot should be larger - assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial" - - -def test_very_small_resolution(): - """Test screenshot with very small resolution.""" - with tempfile.TemporaryDirectory() as tmpdir: - data_dir = Path(tmpdir) - snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small' - screenshot_dir = snapshot_dir / 'screenshot' - screenshot_dir.mkdir(parents=True) - - env = get_test_env() - env['CHROME_RESOLUTION'] = '320,240' # Very small - env['CHROME_HEADLESS'] = 'true' - - result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'], - cwd=str(screenshot_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Should handle small resolution: {result.stderr}" - assert (screenshot_dir / 'screenshot.png').exists() + # With invalid timeout, parseInt returns NaN, which should be handled + assert result.returncode != 0 + assert elapsed < 2 # Should fail quickly, not wait 10s if __name__ == '__main__': diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py index 23beaa76..e365e4b0 100644 --- a/archivebox/plugins/seo/tests/test_seo.py +++ b/archivebox/plugins/seo/tests/test_seo.py @@ -72,10 +72,9 @@ class TestSEOWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the SEO hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run SEO hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index 0fbd3c07..a473f152 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -22,7 +22,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, get_plugin_dir, get_hook_script, - setup_chrome_session, + chrome_session, cleanup_chrome, ) @@ -96,17 +96,15 @@ def test_singlefile_with_chrome_session(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - try: - # Set up Chrome session using shared helper - chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - tmpdir=tmpdir, - crawl_id='singlefile-test-crawl', - snapshot_id='singlefile-test-snap', - test_url=TEST_URL, - navigate=False, # Don't navigate, singlefile will do that - timeout=20, - ) - + # Set up Chrome session using shared helper + with chrome_session( + tmpdir=tmpdir, + crawl_id='singlefile-test-crawl', + snapshot_id='singlefile-test-snap', + 
test_url=TEST_URL, + navigate=False, # Don't navigate, singlefile will do that + timeout=20, + ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): # singlefile looks for ../chrome/cdp_url.txt relative to cwd # So we need to run from a directory that has ../chrome pointing to our chrome dir singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile' @@ -117,9 +115,8 @@ def test_singlefile_with_chrome_session(): if not chrome_link.exists(): chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome') - env = get_test_env() + # Use env from chrome_session env['SINGLEFILE_ENABLED'] = 'true' - env['CHROME_HEADLESS'] = 'true' # Run singlefile - it should find and use the existing Chrome session result = subprocess.run( @@ -143,9 +140,6 @@ def test_singlefile_with_chrome_session(): assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \ f"Singlefile should attempt CDP connection. stderr: {result.stderr}" - finally: - cleanup_chrome(chrome_launch_process, chrome_pid) - def test_singlefile_disabled_skips(): """Test that SINGLEFILE_ENABLED=False exits without JSONL.""" diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py index 48ec0a6c..6261c26b 100644 --- a/archivebox/plugins/ssl/tests/test_ssl.py +++ b/archivebox/plugins/ssl/tests/test_ssl.py @@ -72,10 +72,9 @@ class TestSSLWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the SSL hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run SSL hook with the active Chrome session result = subprocess.run( diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py index f80d0839..3f4412ae 100644 --- a/archivebox/plugins/staticfile/tests/test_staticfile.py +++ b/archivebox/plugins/staticfile/tests/test_staticfile.py @@ -72,16 +72,14 @@ class TestStaticfileWithChrome(TestCase): test_url=test_url, navigate=True, timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir): - # Get environment and run the staticfile hook - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' + ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): + # Use the environment from chrome_session (already has CHROME_HEADLESS=true) + # Run staticfile hook with the active Chrome session result = subprocess.run( ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=120, # Longer timeout as it waits for navigation diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py index 4ced2029..976a4e8c 100644 --- a/archivebox/tests/test_recursive_crawl.py +++ b/archivebox/tests/test_recursive_crawl.py @@ -384,11 +384,11 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process): - """Test that ArchiveResultWorker.get_queue() only blocks on foreground extractors.""" + """Test that background hooks don't block foreground extractors from running.""" os.chdir(tmp_path) - # This test verifies the fix for the orchestrator bug where background hooks - # were 
blocking parser extractors from running + # This test verifies that background hooks run concurrently with foreground hooks + # and don't block parser extractors # Start a crawl env = os.environ.copy() diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index ed8bf832..7c7b4d0b 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -1,15 +1,13 @@ """ Orchestrator for managing worker processes. -The Orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult) -and lazily spawns worker processes when there is work to be done. +The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed. Architecture: - Orchestrator (main loop, polls queues) - ├── CrawlWorker subprocess(es) - ├── SnapshotWorker subprocess(es) - └── ArchiveResultWorker subprocess(es) - └── Each worker spawns task subprocesses via CLI + Orchestrator (polls Crawl queue) + └── CrawlWorker(s) (one per active Crawl) + └── SnapshotWorker(s) (one per Snapshot, up to limit) + └── Hook Processes (sequential, forked by SnapshotWorker) Usage: # Default: runs forever (for use as subprocess of server) @@ -38,7 +36,7 @@ from django.utils import timezone from rich import print from archivebox.misc.logging_util import log_worker_event -from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker +from .worker import Worker, CrawlWorker def _run_orchestrator_process(exit_on_idle: bool) -> None: @@ -52,22 +50,27 @@ def _run_orchestrator_process(exit_on_idle: bool) -> None: class Orchestrator: """ Manages worker processes by polling queues and spawning workers as needed. - + The orchestrator: - 1. Polls each model queue (Crawl, Snapshot, ArchiveResult) - 2. If items exist and fewer than MAX_CONCURRENT workers are running, spawns workers + 1. Polls Crawl queue + 2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers 3. Monitors worker health and cleans up stale PIDs - 4. Exits when all queues are empty (unless daemon mode) + 4. 
Exits when queue is empty (unless daemon mode) + + Architecture: + - Orchestrator spawns CrawlWorkers (one per active Crawl) + - Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit) + - Each SnapshotWorker runs hooks sequentially for its snapshot """ - - WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker] + + # Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator + WORKER_TYPES: list[Type[Worker]] = [CrawlWorker] # Configuration POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds) IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit) - MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type - MAX_TOTAL_WORKERS: int = 24 # Max workers across all types - + MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously + def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None): self.exit_on_idle = exit_on_idle self.crawl_id = crawl_id # If set, only process work for this crawl @@ -76,11 +79,9 @@ class Orchestrator: self.idle_count: int = 0 self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running() - # In foreground mode (exit_on_idle=True), limit workers but allow enough - # for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker + # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker if self.exit_on_idle: - self.MAX_WORKERS_PER_TYPE = 1 - self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently + self.MAX_CRAWL_WORKERS = 1 def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' @@ -109,14 +110,18 @@ class Orchestrator: # Clean up any stale Process records from previous runs stale_count = Process.cleanup_stale_running() + # Clean up orphaned Chrome processes from previous crashes + chrome_count = Process.cleanup_orphaned_chrome() + # Collect startup metadata metadata = { - 'max_workers_per_type': self.MAX_WORKERS_PER_TYPE, - 'max_total_workers': self.MAX_TOTAL_WORKERS, + 'max_crawl_workers': self.MAX_CRAWL_WORKERS, 'poll_interval': self.POLL_INTERVAL, } if stale_count: metadata['cleaned_stale_pids'] = stale_count + if chrome_count: + metadata['cleaned_orphaned_chrome'] = chrome_count log_worker_event( worker_type='Orchestrator', @@ -126,8 +131,34 @@ class Orchestrator: metadata=metadata, ) + def terminate_all_workers(self) -> None: + """Terminate all running worker processes.""" + from archivebox.machine.models import Process + import signal + + # Get all running worker processes + running_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + status__in=['running', 'started'] + ) + + for worker_process in running_workers: + try: + # Send SIGTERM to gracefully terminate the worker + os.kill(worker_process.pid, signal.SIGTERM) + except ProcessLookupError: + # Process already dead + pass + except Exception: + # Ignore other errors during shutdown + pass + def on_shutdown(self, error: BaseException | None = None) -> None: """Called when orchestrator shuts down.""" + # Terminate all worker processes in exit_on_idle mode + if self.exit_on_idle: + self.terminate_all_workers() + # Update Process record status if hasattr(self, 'db_process') and self.db_process: # KeyboardInterrupt is a graceful shutdown, not an error @@ -163,20 +194,15 @@ class Orchestrator: return len(WorkerClass.get_running_workers()) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: - """Determine if we should spawn a 
new worker of the given type.""" + """Determine if we should spawn a new CrawlWorker.""" if queue_count == 0: return False - # Check per-type limit + # Check CrawlWorker limit running_workers = WorkerClass.get_running_workers() running_count = len(running_workers) - if running_count >= self.MAX_WORKERS_PER_TYPE: - return False - - # Check total limit - total_workers = self.get_total_worker_count() - if total_workers >= self.MAX_TOTAL_WORKERS: + if running_count >= self.MAX_CRAWL_WORKERS: return False # Check if we already have enough workers for the queue size @@ -190,7 +216,7 @@ class Orchestrator: """Spawn a new worker process. Returns PID or None if spawn failed.""" try: print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]') - pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id) + pid = WorkerClass.start(crawl_id=self.crawl_id) print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]') # CRITICAL: Block until worker registers itself in Process table @@ -259,24 +285,49 @@ class Orchestrator: def check_queues_and_spawn_workers(self) -> dict[str, int]: """ - Check all queues and spawn workers as needed. - Returns dict of queue sizes by worker type. + Check Crawl queue and spawn CrawlWorkers as needed. + Returns dict of queue sizes. """ + from archivebox.crawls.models import Crawl + queue_sizes = {} - for WorkerClass in self.WORKER_TYPES: - # Get queue for this worker type - # Need to instantiate worker to get queue (for model access) - worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id) # temp instance just for queue access - queue = worker.get_queue() - queue_count = queue.count() - queue_sizes[WorkerClass.name] = queue_count + # Only check Crawl queue + crawl_queue = Crawl.objects.filter( + retry_at__lte=timezone.now() + ).exclude( + status__in=Crawl.FINAL_STATES + ) + + # Apply crawl_id filter if set + if self.crawl_id: + crawl_queue = crawl_queue.filter(id=self.crawl_id) + + crawl_queue = crawl_queue.order_by('retry_at') + crawl_count = crawl_queue.count() + queue_sizes['crawl'] = crawl_count + + # Spawn CrawlWorker if needed + if self.should_spawn_worker(CrawlWorker, crawl_count): + # Claim next crawl + crawl = crawl_queue.first() + if crawl and self._claim_crawl(crawl): + CrawlWorker.start(crawl_id=str(crawl.id)) - # Spawn worker if needed - if self.should_spawn_worker(WorkerClass, queue_count): - self.spawn_worker(WorkerClass) - return queue_sizes + + def _claim_crawl(self, crawl) -> bool: + """Atomically claim a crawl using optimistic locking.""" + from archivebox.crawls.models import Crawl + + updated = Crawl.objects.filter( + pk=crawl.pk, + retry_at=crawl.retry_at, + ).update( + retry_at=timezone.now() + timedelta(hours=24), # Long lock (crawls take time) + ) + + return updated == 1 def has_pending_work(self, queue_sizes: dict[str, int]) -> bool: """Check if any queue has pending work.""" @@ -287,30 +338,21 @@ class Orchestrator: return self.get_total_worker_count() > 0 def has_future_work(self) -> bool: - """Check if there's work scheduled for the future (retry_at > now).""" - for WorkerClass in self.WORKER_TYPES: - worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id) - Model = worker.get_model() + """Check if there's work scheduled for the future (retry_at > now) in Crawl queue.""" + from archivebox.crawls.models import Crawl - # Build filter for future work, respecting crawl_id if set - qs = Model.objects.filter( - retry_at__gt=timezone.now() - ).exclude( - 
status__in=Model.FINAL_STATES - ) + # Build filter for future work, respecting crawl_id if set + qs = Crawl.objects.filter( + retry_at__gt=timezone.now() + ).exclude( + status__in=Crawl.FINAL_STATES + ) - # Apply crawl_id filter if set - if self.crawl_id: - if WorkerClass.name == 'crawl': - qs = qs.filter(id=self.crawl_id) - elif WorkerClass.name == 'snapshot': - qs = qs.filter(crawl_id=self.crawl_id) - elif WorkerClass.name == 'archiveresult': - qs = qs.filter(snapshot__crawl_id=self.crawl_id) + # Apply crawl_id filter if set + if self.crawl_id: + qs = qs.filter(id=self.crawl_id) - if qs.count() > 0: - return True - return False + return qs.count() > 0 def on_tick(self, queue_sizes: dict[str, int]) -> None: """Called each orchestrator tick. Override for custom behavior.""" @@ -345,20 +387,20 @@ class Orchestrator: def runloop(self) -> None: """Main orchestrator loop.""" - from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn - from archivebox.misc.logging import IS_TTY, CONSOLE + from rich.live import Live + from archivebox.misc.logging import IS_TTY + from archivebox.misc.progress_layout import ArchiveBoxProgressLayout import sys import os - # Enable progress bars only in TTY + foreground mode + # Enable progress layout only in TTY + foreground mode show_progress = IS_TTY and self.exit_on_idle self.on_startup() - task_ids = {} if not show_progress: - # No progress bars - just run normally - self._run_orchestrator_loop(None, task_ids) + # No progress layout - just run normally + self._run_orchestrator_loop(None) else: # Redirect worker subprocess output to /dev/null devnull_fd = os.open(os.devnull, os.O_WRONLY) @@ -384,14 +426,16 @@ class Orchestrator: original_console = logging_module.CONSOLE logging_module.CONSOLE = orchestrator_console - # Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null) - with Progress( - TextColumn("[cyan]{task.description}"), - BarColumn(bar_width=40), - TaskProgressColumn(), + # Create layout and run with Live display + progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) + + with Live( + progress_layout.get_layout(), + refresh_per_second=4, + screen=True, console=orchestrator_console, - ) as progress: - self._run_orchestrator_loop(progress, task_ids) + ): + self._run_orchestrator_loop(progress_layout) # Restore original console logging_module.CONSOLE = original_console @@ -409,22 +453,68 @@ class Orchestrator: pass # stdout_for_console is closed by orchestrator_console - def _run_orchestrator_loop(self, progress, task_ids): + def _run_orchestrator_loop(self, progress_layout): """Run the main orchestrator loop with optional progress display.""" last_queue_sizes = {} last_snapshot_count = None + tick_count = 0 + + # Track snapshot progress to detect changes + snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin) + try: while True: + tick_count += 1 + # Check queues and spawn workers queue_sizes = self.check_queues_and_spawn_workers() - # Debug queue sizes (only when changed) - if progress and queue_sizes != last_queue_sizes: - progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]') - last_queue_sizes = queue_sizes.copy() + # Get worker counts for each type + worker_counts = { + WorkerClass.name: len(WorkerClass.get_running_workers()) + for WorkerClass in self.WORKER_TYPES + } - # Update progress bars - if progress: + # Update layout if enabled + if progress_layout: + # Get crawl queue and worker counts + crawl_queue_count = queue_sizes.get('crawl', 
0) + crawl_workers_count = worker_counts.get('crawl', 0) + + # Determine orchestrator status + if crawl_workers_count > 0: + status = "Working" + elif crawl_queue_count > 0: + status = "Spawning" + else: + status = "Idle" + + # Update orchestrator status + progress_layout.update_orchestrator_status( + status=status, + crawl_queue_count=crawl_queue_count, + crawl_workers_count=crawl_workers_count, + max_crawl_workers=self.MAX_CRAWL_WORKERS, + ) + + # Log queue size changes + if queue_sizes != last_queue_sizes: + for worker_type, count in queue_sizes.items(): + old_count = last_queue_sizes.get(worker_type, 0) + if count != old_count: + if count > old_count: + progress_layout.log_event( + f"{worker_type.capitalize()} queue: {old_count} → {count}", + style="yellow" + ) + else: + progress_layout.log_event( + f"{worker_type.capitalize()} queue: {old_count} → {count}", + style="green" + ) + last_queue_sizes = queue_sizes.copy() + + # Update snapshot progress from archivebox.core.models import Snapshot # Get all started snapshots (optionally filtered by crawl_id) @@ -438,9 +528,36 @@ class Orchestrator: active_snapshots = list(Snapshot.objects.filter(**snapshot_filter)) - # Debug snapshot count (only when changed) + # Log snapshot count changes and details if len(active_snapshots) != last_snapshot_count: - progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]') + if last_snapshot_count is not None: + if len(active_snapshots) > last_snapshot_count: + progress_layout.log_event( + f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}", + style="cyan" + ) + # Log which snapshots started + for snapshot in active_snapshots[-1:]: # Just show the newest one + progress_layout.log_event( + f"Started: {snapshot.url[:60]}", + style="green" + ) + + # Log SnapshotWorker count + from archivebox.machine.models import Process + all_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + status__in=['running', 'started'] + ).count() + progress_layout.log_event( + f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)", + style="grey53" + ) + else: + progress_layout.log_event( + f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}", + style="blue" + ) last_snapshot_count = len(active_snapshots) # Track which snapshots are still active @@ -450,13 +567,14 @@ class Orchestrator: active_ids.add(snapshot.id) total = snapshot.archiveresult_set.count() - if total == 0: - continue - completed = snapshot.archiveresult_set.filter( status__in=['succeeded', 'skipped', 'failed'] ).count() + # Count hooks by status for debugging + queued = snapshot.archiveresult_set.filter(status='queued').count() + started = snapshot.archiveresult_set.filter(status='started').count() + # Find currently running hook (ordered by hook_name to get lowest step number) current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first() if not current_ar: @@ -472,24 +590,78 @@ class Orchestrator: # Clean up the name: remove prefix and extension clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '') - current_plugin = f" • {clean_name}" + current_plugin = clean_name + elif total == 0: + # Snapshot just started, hooks not created yet + current_plugin = "initializing" + elif queued > 0: + # Hooks created but none started yet + current_plugin = "waiting" - # Build description with URL + current 
plugin - url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url - description = f"{url}{current_plugin}" + # Update snapshot worker (show even if no hooks yet) + # Debug: Log first time we see this snapshot + if snapshot.id not in progress_layout.snapshot_to_worker: + progress_layout.log_event( + f"Assigning to worker: {snapshot.url[:50]}", + style="grey53" + ) - # Create or update task - if snapshot.id not in task_ids: - task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed) - else: - # Update both progress and description - progress.update(task_ids[snapshot.id], description=description, completed=completed) + # Track progress changes + prev_progress = snapshot_progress.get(snapshot.id, (0, 0, '')) + curr_progress = (total, completed, current_plugin) - # Remove tasks for snapshots that are no longer active - for snapshot_id in list(task_ids.keys()): + if prev_progress != curr_progress: + prev_total, prev_completed, prev_plugin = prev_progress + + # Log hooks created + if total > prev_total: + progress_layout.log_event( + f"Hooks created: {total} for {snapshot.url[:40]}", + style="cyan" + ) + + # Log hook completion + if completed > prev_completed: + progress_layout.log_event( + f"Hook completed: {completed}/{total} for {snapshot.url[:40]}", + style="green" + ) + + # Log plugin change + if current_plugin and current_plugin != prev_plugin: + progress_layout.log_event( + f"Running: {current_plugin} ({snapshot.url[:40]})", + style="yellow" + ) + + snapshot_progress[snapshot.id] = curr_progress + + # Debug: Every 10 ticks, log detailed status if stuck at initializing + if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing": + progress_layout.log_event( + f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})", + style="red" + ) + + progress_layout.update_snapshot_worker( + snapshot_id=snapshot.id, + url=snapshot.url, + total=max(total, 1), # Show at least 1 to avoid division by zero + completed=completed, + current_plugin=current_plugin, + ) + + # Remove snapshots that are no longer active + for snapshot_id in list(progress_layout.snapshot_to_worker.keys()): if snapshot_id not in active_ids: - progress.remove_task(task_ids[snapshot_id]) - del task_ids[snapshot_id] + progress_layout.log_event( + f"Snapshot completed/removed", + style="blue" + ) + progress_layout.remove_snapshot_worker(snapshot_id) + # Also clean up progress tracking + if snapshot_id in snapshot_progress: + del snapshot_progress[snapshot_id] # Track idle state has_pending = self.has_pending_work(queue_sizes) @@ -503,6 +675,8 @@ class Orchestrator: # Check if we should exit if self.should_exit(queue_sizes): + if progress_layout: + progress_layout.log_event("All work complete", style="green") log_worker_event( worker_type='Orchestrator', event='All work complete', @@ -514,8 +688,12 @@ class Orchestrator: time.sleep(self.POLL_INTERVAL) except KeyboardInterrupt: + if progress_layout: + progress_layout.log_event("Interrupted by user", style="red") print() # Newline after ^C except BaseException as e: + if progress_layout: + progress_layout.log_event(f"Error: {e}", style="red") self.on_shutdown(error=e) raise else: diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 5a0c0980..439bdda4 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -34,7 +34,7 @@ CPU_COUNT = cpu_count() WORKER_TYPES: dict[str, type['Worker']] = {} -def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, 
**kwargs): +def _run_worker(worker_class_name: str, worker_id: int, **kwargs): """ Module-level function to run a worker. Must be at module level for pickling. """ @@ -43,16 +43,28 @@ def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs): # Get worker class by name to avoid pickling class objects worker_cls = WORKER_TYPES[worker_class_name] - worker = worker_cls(worker_id=worker_id, daemon=daemon, **kwargs) + worker = worker_cls(worker_id=worker_id, **kwargs) + worker.runloop() + + +def _run_snapshot_worker(snapshot_id: str, worker_id: int, **kwargs): + """ + Module-level function to run a SnapshotWorker for a specific snapshot. + Must be at module level for pickling compatibility. + """ + from archivebox.config.django import setup_django + setup_django() + + worker = SnapshotWorker(snapshot_id=snapshot_id, worker_id=worker_id, **kwargs) worker.runloop() class Worker: """ - Base worker class that polls a queue and processes items directly. + Base worker class for CrawlWorker and SnapshotWorker. - Each item is processed by calling its state machine tick() method. - Workers exit when idle for too long (unless daemon mode). + Workers are spawned as subprocesses to process crawls and snapshots. + Each worker type has its own custom runloop implementation. """ name: ClassVar[str] = 'worker' @@ -60,16 +72,10 @@ class Worker: # Configuration (can be overridden by subclasses) MAX_TICK_TIME: ClassVar[int] = 60 MAX_CONCURRENT_TASKS: ClassVar[int] = 1 - POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds) - IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval) - def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any): + def __init__(self, worker_id: int = 0, **kwargs: Any): self.worker_id = worker_id - self.daemon = daemon - self.crawl_id = crawl_id # If set, only process work for this crawl self.pid: int = os.getpid() - self.pid_file: Path | None = None - self.idle_count: int = 0 def __repr__(self) -> str: return f'[underline]{self.__class__.__name__}[/underline]\\[id={self.worker_id}, pid={self.pid}]' @@ -78,55 +84,6 @@ class Worker: """Get the Django model class. Subclasses must override this.""" raise NotImplementedError("Subclasses must implement get_model()") - def get_queue(self) -> QuerySet: - """Get the queue of objects ready for processing.""" - Model = self.get_model() - return Model.objects.filter( - retry_at__lte=timezone.now() - ).exclude( - status__in=Model.FINAL_STATES - ).order_by('retry_at') - - def claim_next(self): - """ - Atomically claim the next object from the queue. - Returns the claimed object or None if queue is empty or claim failed. - """ - Model = self.get_model() - - queue = self.get_queue() - obj = queue.first() - if obj is None: - return None - - # Atomic claim using optimistic locking on retry_at - claimed = Model.objects.filter( - pk=obj.pk, - retry_at=obj.retry_at, - ).update( - retry_at=timezone.now() + timedelta(seconds=self.MAX_TICK_TIME) - ) - - if claimed == 1: - obj.refresh_from_db() - return obj - - return None # Someone else claimed it - - def process_item(self, obj) -> bool: - """ - Process a single item by calling its state machine tick(). - Returns True on success, False on failure. - Subclasses can override for custom processing. 
- """ - try: - obj.sm.tick() - return True - except Exception as e: - # Error will be logged in runloop's completion event - traceback.print_exc() - return False - def on_startup(self) -> None: """Called when worker starts.""" from archivebox.machine.models import Process @@ -139,7 +96,7 @@ class Worker: if self.db_process.process_type != Process.TypeChoices.WORKER: self.db_process.process_type = Process.TypeChoices.WORKER update_fields.append('process_type') - # Store worker type name (crawl/snapshot/archiveresult) in worker_type field + # Store worker type name (crawl/snapshot) in worker_type field if not self.db_process.worker_type: self.db_process.worker_type = self.name update_fields.append('worker_type') @@ -148,13 +105,11 @@ class Worker: # Determine worker type for logging worker_type_name = self.__class__.__name__ - indent_level = 1 # Default for most workers + indent_level = 1 # Default for CrawlWorker - # Adjust indent level based on worker type + # SnapshotWorker gets indent level 2 if 'Snapshot' in worker_type_name: indent_level = 2 - elif 'ArchiveResult' in worker_type_name: - indent_level = 3 log_worker_event( worker_type=worker_type_name, @@ -162,10 +117,6 @@ class Worker: indent_level=indent_level, pid=self.pid, worker_id=str(self.worker_id), - metadata={ - 'max_concurrent': self.MAX_CONCURRENT_TASKS, - 'poll_interval': self.POLL_INTERVAL, - }, ) def on_shutdown(self, error: BaseException | None = None) -> None: @@ -179,12 +130,10 @@ class Worker: # Determine worker type for logging worker_type_name = self.__class__.__name__ - indent_level = 1 + indent_level = 1 # CrawlWorker if 'Snapshot' in worker_type_name: indent_level = 2 - elif 'ArchiveResult' in worker_type_name: - indent_level = 3 log_worker_event( worker_type=worker_type_name, @@ -195,121 +144,157 @@ class Worker: error=error if error and not isinstance(error, KeyboardInterrupt) else None, ) - def should_exit(self) -> bool: - """Check if worker should exit due to idle timeout.""" - if self.daemon: - return False + def _terminate_background_hooks( + self, + background_processes: dict[str, 'Process'], + worker_type: str, + indent_level: int, + ) -> None: + """ + Terminate background hooks in 3 phases (shared logic for Crawl/Snapshot workers). 
- if self.IDLE_TIMEOUT == 0: - return False + Phase 1: Send SIGTERM to all bg hooks + children in parallel (polite request to wrap up) + Phase 2: Wait for each hook's remaining timeout before SIGKILL + Phase 3: SIGKILL any stragglers that exceeded their timeout - return self.idle_count >= self.IDLE_TIMEOUT + Args: + background_processes: Dict mapping hook name -> Process instance + worker_type: Worker type name for logging (e.g., 'CrawlWorker', 'SnapshotWorker') + indent_level: Logging indent level (1 for Crawl, 2 for Snapshot) + """ + import signal + import time - def runloop(self) -> None: - """Main worker loop - polls queue, processes items.""" - self.on_startup() + if not background_processes: + return - # Determine worker type for logging - worker_type_name = self.__class__.__name__ - indent_level = 1 + now = time.time() - if 'Snapshot' in worker_type_name: - indent_level = 2 - elif 'ArchiveResult' in worker_type_name: - indent_level = 3 + # Phase 1: Send SIGTERM to ALL background processes + children in parallel + log_worker_event( + worker_type=worker_type, + event=f'Sending SIGTERM to {len(background_processes)} background hooks (+ children)', + indent_level=indent_level, + pid=self.pid, + ) - try: - while True: - # Try to claim and process an item - obj = self.claim_next() - - if obj is not None: - self.idle_count = 0 - - # Build metadata for task start - start_metadata = {} - url = None - if hasattr(obj, 'url'): - # SnapshotWorker - url = str(obj.url) if obj.url else None - elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'): - # ArchiveResultWorker - url = str(obj.snapshot.url) if obj.snapshot.url else None - elif hasattr(obj, 'get_urls_list'): - # CrawlWorker - urls = obj.get_urls_list() - url = urls[0] if urls else None - - plugin = None - if hasattr(obj, 'plugin'): - # ArchiveResultWorker, Crawl - plugin = obj.plugin + # Build deadline map first (before killing, to get accurate remaining time) + deadlines = {} + for hook_name, process in background_processes.items(): + elapsed = now - process.started_at.timestamp() + remaining = max(0, process.timeout - elapsed) + deadline = now + remaining + deadlines[hook_name] = (process, deadline) + # Send SIGTERM to all process trees in parallel (non-blocking) + for hook_name, process in background_processes.items(): + try: + # Get chrome children (renderer processes etc) before sending signal + children_pids = process.get_children_pids() + if children_pids: + # Chrome hook with children - kill tree + os.kill(process.pid, signal.SIGTERM) + for child_pid in children_pids: + try: + os.kill(child_pid, signal.SIGTERM) + except ProcessLookupError: + pass log_worker_event( - worker_type=worker_type_name, - event='Processing', + worker_type=worker_type, + event=f'Sent SIGTERM to {hook_name} + {len(children_pids)} children', indent_level=indent_level, pid=self.pid, - worker_id=str(self.worker_id), - url=url, - plugin=plugin, - metadata=start_metadata if start_metadata else None, - ) - - start_time = time.time() - success = self.process_item(obj) - elapsed = time.time() - start_time - - # Build metadata for task completion - complete_metadata = { - 'duration': elapsed, - 'status': 'success' if success else 'failed', - } - - log_worker_event( - worker_type=worker_type_name, - event='Completed' if success else 'Failed', - indent_level=indent_level, - pid=self.pid, - worker_id=str(self.worker_id), - url=url, - plugin=plugin, - metadata=complete_metadata, ) else: - # No work available - idle logging suppressed - self.idle_count += 1 + # No 
children - normal kill + os.kill(process.pid, signal.SIGTERM) + except ProcessLookupError: + pass # Already dead + except Exception as e: + log_worker_event( + worker_type=worker_type, + event=f'Failed to SIGTERM {hook_name}: {e}', + indent_level=indent_level, + pid=self.pid, + ) - # Check if we should exit - if self.should_exit(): - # Exit logging suppressed - shutdown will be logged by on_shutdown() - break + # Phase 2: Wait for all processes in parallel, respecting individual timeouts + for hook_name, (process, deadline) in deadlines.items(): + remaining = deadline - now + log_worker_event( + worker_type=worker_type, + event=f'Waiting up to {remaining:.1f}s for {hook_name}', + indent_level=indent_level, + pid=self.pid, + ) - time.sleep(self.POLL_INTERVAL) + # Poll all processes in parallel using Process.poll() + still_running = set(deadlines.keys()) - except KeyboardInterrupt: - pass - except BaseException as e: - self.on_shutdown(error=e) - raise - else: - self.on_shutdown() + while still_running: + time.sleep(0.1) + now = time.time() + + for hook_name in list(still_running): + process, deadline = deadlines[hook_name] + + # Check if process exited using Process.poll() + exit_code = process.poll() + if exit_code is not None: + # Process exited + still_running.remove(hook_name) + log_worker_event( + worker_type=worker_type, + event=f'✓ {hook_name} exited with code {exit_code}', + indent_level=indent_level, + pid=self.pid, + ) + continue + + # Check if deadline exceeded + if now >= deadline: + # Timeout exceeded - SIGKILL process tree + try: + # Get children before killing (chrome may have spawned more) + children_pids = process.get_children_pids() + if children_pids: + # Kill children first + for child_pid in children_pids: + try: + os.kill(child_pid, signal.SIGKILL) + except ProcessLookupError: + pass + # Then kill parent + process.kill(signal_num=signal.SIGKILL) + log_worker_event( + worker_type=worker_type, + event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)', + indent_level=indent_level, + pid=self.pid, + ) + except Exception as e: + log_worker_event( + worker_type=worker_type, + event=f'Failed to SIGKILL {hook_name}: {e}', + indent_level=indent_level, + pid=self.pid, + ) + still_running.remove(hook_name) @classmethod - def start(cls, worker_id: int | None = None, daemon: bool = False, **kwargs: Any) -> int: + def start(cls, **kwargs: Any) -> int: """ Fork a new worker as a subprocess. Returns the PID of the new process. """ from archivebox.machine.models import Process - if worker_id is None: - worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) + worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) # Use module-level function for pickling compatibility proc = MPProcess( target=_run_worker, - args=(cls.name, worker_id, daemon), + args=(cls.name, worker_id), kwargs=kwargs, name=f'{cls.name}_worker_{worker_id}', ) @@ -356,120 +341,397 @@ class Worker: class CrawlWorker(Worker): - """Worker for processing Crawl objects.""" + """ + Worker for processing Crawl objects. + + Responsibilities: + 1. Run on_Crawl__* hooks (e.g., chrome launcher) + 2. Create Snapshots from URLs + 3. Spawn SnapshotWorkers (up to MAX_SNAPSHOT_WORKERS) + 4. 
Monitor snapshots and seal crawl when all done + """ name: ClassVar[str] = 'crawl' MAX_TICK_TIME: ClassVar[int] = 60 + MAX_SNAPSHOT_WORKERS: ClassVar[int] = 8 # Per crawl limit + + def __init__(self, crawl_id: str, **kwargs: Any): + super().__init__(**kwargs) + self.crawl_id = crawl_id + self.crawl = None def get_model(self): from archivebox.crawls.models import Crawl return Crawl - def get_queue(self) -> QuerySet: - """Get queue of Crawls ready for processing, optionally filtered by crawl_id.""" - qs = super().get_queue() - if self.crawl_id: - qs = qs.filter(id=self.crawl_id) - return qs + def on_startup(self) -> None: + """Load crawl.""" + super().on_startup() + + from archivebox.crawls.models import Crawl + self.crawl = Crawl.objects.get(id=self.crawl_id) + + def runloop(self) -> None: + """Run crawl state machine, spawn SnapshotWorkers.""" + import sys + self.on_startup() + + try: + print(f'[cyan]🔄 CrawlWorker.runloop: Starting tick() for crawl {self.crawl_id}[/cyan]', file=sys.stderr) + # Advance state machine: QUEUED → STARTED (triggers run() via @started.enter) + self.crawl.sm.tick() + self.crawl.refresh_from_db() + print(f'[cyan]🔄 tick() complete, crawl status={self.crawl.status}[/cyan]', file=sys.stderr) + + # Now spawn SnapshotWorkers and monitor progress + while True: + # Check if crawl is done + if self._is_crawl_finished(): + print(f'[cyan]🔄 Crawl finished, sealing...[/cyan]', file=sys.stderr) + self.crawl.sm.seal() + break + + # Spawn workers for queued snapshots + self._spawn_snapshot_workers() + + time.sleep(2) # Check every 2s + + finally: + self.on_shutdown() + + def _spawn_snapshot_workers(self) -> None: + """Spawn SnapshotWorkers for queued snapshots (up to limit).""" + from archivebox.core.models import Snapshot + from archivebox.machine.models import Process + + # Count running SnapshotWorkers for this crawl + running_count = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type='snapshot', + parent_id=self.db_process.id, # Children of this CrawlWorker + status__in=['running', 'started'], + ).count() + + if running_count >= self.MAX_SNAPSHOT_WORKERS: + return # At limit + + # Get queued snapshots for this crawl (SnapshotWorker will mark as STARTED in on_startup) + queued_snapshots = Snapshot.objects.filter( + crawl_id=self.crawl_id, + status=Snapshot.StatusChoices.QUEUED, + ).order_by('created_at')[:self.MAX_SNAPSHOT_WORKERS - running_count] + + import sys + print(f'[yellow]🔧 _spawn_snapshot_workers: running={running_count}/{self.MAX_SNAPSHOT_WORKERS}, queued={queued_snapshots.count()}[/yellow]', file=sys.stderr) + + # Spawn workers + for snapshot in queued_snapshots: + print(f'[yellow]🔧 Spawning worker for {snapshot.url} (status={snapshot.status})[/yellow]', file=sys.stderr) + SnapshotWorker.start(snapshot_id=str(snapshot.id)) + log_worker_event( + worker_type='CrawlWorker', + event=f'Spawned SnapshotWorker for {snapshot.url}', + indent_level=1, + pid=self.pid, + ) + + def _is_crawl_finished(self) -> bool: + """Check if all snapshots are sealed.""" + from archivebox.core.models import Snapshot + + pending = Snapshot.objects.filter( + crawl_id=self.crawl_id, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).count() + + return pending == 0 + + def on_shutdown(self, error: BaseException | None = None) -> None: + """ + Terminate all background Crawl hooks when crawl finishes. 
+ + Background hooks (e.g., chrome launcher) should only be killed when: + - All snapshots are done (crawl is sealed) + - Worker is shutting down + """ + from archivebox.machine.models import Process + + # Query for all running hook processes that are children of this CrawlWorker + background_hooks = Process.objects.filter( + parent_id=self.db_process.id, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + ).select_related('machine') + + # Build dict for shared termination logic + background_processes = { + hook.cmd[0] if hook.cmd else f'hook-{hook.pid}': hook + for hook in background_hooks + } + + # Use shared termination logic from Worker base class + self._terminate_background_hooks( + background_processes=background_processes, + worker_type='CrawlWorker', + indent_level=1, + ) + + super().on_shutdown(error) class SnapshotWorker(Worker): - """Worker for processing Snapshot objects.""" + """ + Worker that owns sequential hook execution for ONE snapshot. + + Unlike other workers, SnapshotWorker doesn't poll a queue - it's given + a specific snapshot_id and runs all hooks for that snapshot sequentially. + + Execution flow: + 1. Mark snapshot as STARTED + 2. Discover hooks for snapshot + 3. For each hook (sorted by name): + a. Fork hook Process + b. If foreground: wait for completion + c. If background: track but continue to next hook + d. Update ArchiveResult status + e. Advance current_step when all step's hooks complete + 4. When all hooks done: seal snapshot + 5. On shutdown: SIGTERM all background hooks + """ name: ClassVar[str] = 'snapshot' - MAX_TICK_TIME: ClassVar[int] = 60 + + def __init__(self, snapshot_id: str, **kwargs: Any): + super().__init__(**kwargs) + self.snapshot_id = snapshot_id + self.snapshot = None + self.background_processes: dict[str, Any] = {} # hook_name -> Process def get_model(self): + """Not used - SnapshotWorker doesn't poll queues.""" from archivebox.core.models import Snapshot return Snapshot - def get_queue(self) -> QuerySet: - """Get queue of Snapshots ready for processing, optionally filtered by crawl_id.""" - qs = super().get_queue() - if self.crawl_id: - qs = qs.filter(crawl_id=self.crawl_id) - return qs + def on_startup(self) -> None: + """Load snapshot and mark as STARTED.""" + super().on_startup() + from archivebox.core.models import Snapshot + self.snapshot = Snapshot.objects.get(id=self.snapshot_id) -class ArchiveResultWorker(Worker): - """Worker for processing ArchiveResult objects.""" + # Mark snapshot as STARTED + self.snapshot.status = Snapshot.StatusChoices.STARTED + self.snapshot.retry_at = None # No more polling needed + self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) - name: ClassVar[str] = 'archiveresult' - MAX_TICK_TIME: ClassVar[int] = 120 - - def __init__(self, plugin: str | None = None, **kwargs: Any): - super().__init__(**kwargs) - self.plugin = plugin - - def get_model(self): + def runloop(self) -> None: + """Execute all hooks sequentially.""" + from archivebox.hooks import discover_hooks, is_background_hook, extract_step from archivebox.core.models import ArchiveResult - return ArchiveResult - def get_queue(self) -> QuerySet: - """ - Get queue of ArchiveResults ready for processing. + self.on_startup() - Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step. - This ensures hooks execute in order (step 0 → 1 → 2 ... → 9). 
- """ - from archivebox.core.models import ArchiveResult - from archivebox.hooks import extract_step - - qs = super().get_queue() - - if self.crawl_id: - qs = qs.filter(snapshot__crawl_id=self.crawl_id) - - if self.plugin: - qs = qs.filter(plugin=self.plugin) - - # Step-based filtering: only process ARs whose step <= snapshot.current_step - # Since step is derived from hook_name, we filter in Python after initial query - # This is efficient because the base query already filters by retry_at and status - - # Get candidate ARs - candidates = list(qs[:50]) # Limit to avoid loading too many - ready_pks = [] - - for ar in candidates: - if not ar.hook_name: - # Legacy ARs without hook_name - process them - ready_pks.append(ar.pk) - continue - - ar_step = extract_step(ar.hook_name) - snapshot_step = ar.snapshot.current_step - - if ar_step <= snapshot_step: - ready_pks.append(ar.pk) - - # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step) - return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at') - - def process_item(self, obj) -> bool: - """Process an ArchiveResult by running its plugin.""" try: - obj.sm.tick() - return True + # Discover all hooks for this snapshot + hooks = discover_hooks('Snapshot', config=self.snapshot.config) + hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix) + + # Execute each hook sequentially + for hook_path in hooks: + hook_name = hook_path.name + plugin = self._extract_plugin_name(hook_name) + hook_step = extract_step(hook_name) + is_background = is_background_hook(hook_name) + + # Create ArchiveResult for THIS HOOK (not per plugin) + # One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js) + # Unique key = (snapshot, plugin, hook_name) for idempotency + ar, created = ArchiveResult.objects.get_or_create( + snapshot=self.snapshot, + plugin=plugin, + hook_name=hook_name, + defaults={ + 'status': ArchiveResult.StatusChoices.STARTED, + 'start_ts': timezone.now(), + } + ) + + if not created: + # Update existing AR to STARTED + ar.status = ArchiveResult.StatusChoices.STARTED + ar.start_ts = timezone.now() + ar.save(update_fields=['status', 'start_ts', 'modified_at']) + + # Fork and run the hook + process = self._run_hook(hook_path, ar) + + if is_background: + # Track but don't wait + self.background_processes[hook_name] = process + log_worker_event( + worker_type='SnapshotWorker', + event=f'Started background hook: {hook_name} (timeout={process.timeout}s)', + indent_level=2, + pid=self.pid, + ) + else: + # Wait for foreground hook to complete + self._wait_for_hook(process, ar) + log_worker_event( + worker_type='SnapshotWorker', + event=f'Completed hook: {hook_name}', + indent_level=2, + pid=self.pid, + ) + + # Check if we can advance to next step + self._try_advance_step() + + # All hooks launched (or completed) - cleanup and seal + self._cleanup_empty_archiveresults() + self.snapshot.status = Snapshot.StatusChoices.SEALED + self.snapshot.save(update_fields=['status', 'modified_at']) + except Exception as e: - # Error will be logged in runloop's completion event - traceback.print_exc() - return False + # Mark snapshot as failed + self.snapshot.status = Snapshot.StatusChoices.SEALED # Still seal on error + self.snapshot.save(update_fields=['status', 'modified_at']) + raise + finally: + self.on_shutdown() + + def _run_hook(self, hook_path: Path, ar: Any) -> Any: + """Fork and run a hook using Process model, 
return Process.""" + from archivebox.hooks import run_hook + + # Create output directory + output_dir = ar.create_output_dir() + + # Run hook using Process.launch() - returns Process model directly + # Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy + process = run_hook( + script=hook_path, + output_dir=output_dir, + config=self.snapshot.config, + timeout=120, + parent=self.db_process, + url=str(self.snapshot.url), + snapshot_id=str(self.snapshot.id), + ) + + # Link ArchiveResult to Process for tracking + ar.process = process + ar.save(update_fields=['process_id', 'modified_at']) + + return process + + def _wait_for_hook(self, process: Any, ar: Any) -> None: + """Wait for hook using Process.wait(), update AR status.""" + # Use Process.wait() helper instead of manual polling + try: + exit_code = process.wait(timeout=process.timeout) + except TimeoutError: + # Hook exceeded timeout - kill it + process.kill(signal_num=9) + exit_code = -1 + + # Update ArchiveResult from hook output + ar.update_from_output() + ar.end_ts = timezone.now() + + # Determine final status from hook exit code + if exit_code == 0: + ar.status = ar.StatusChoices.SUCCEEDED + else: + ar.status = ar.StatusChoices.FAILED + + ar.save(update_fields=['status', 'end_ts', 'modified_at']) + + def _try_advance_step(self) -> None: + """Advance current_step if all foreground hooks in current step are done.""" + from django.db.models import Q + from archivebox.core.models import ArchiveResult + + current_step = self.snapshot.current_step + + # Single query: foreground hooks in current step that aren't finished + # Foreground hooks: hook_name doesn't contain '.bg.' + pending_foreground = self.snapshot.archiveresult_set.filter( + Q(hook_name__contains=f'__{current_step}_') & # Current step + ~Q(hook_name__contains='.bg.') & # Not background + ~Q(status__in=ArchiveResult.FINAL_STATES) # Not finished + ).exists() + + if pending_foreground: + return # Still waiting for hooks + + # All foreground hooks done - advance! + self.snapshot.current_step += 1 + self.snapshot.save(update_fields=['current_step', 'modified_at']) + + log_worker_event( + worker_type='SnapshotWorker', + event=f'Advanced to step {self.snapshot.current_step}', + indent_level=2, + pid=self.pid, + ) + + def _cleanup_empty_archiveresults(self) -> None: + """Delete ArchiveResults that produced no output files.""" + empty_ars = self.snapshot.archiveresult_set.filter( + output_files={} # No output files + ).filter( + status__in=self.snapshot.archiveresult_set.model.FINAL_STATES # Only delete finished ones + ) + + deleted_count = empty_ars.count() + if deleted_count > 0: + empty_ars.delete() + log_worker_event( + worker_type='SnapshotWorker', + event=f'Deleted {deleted_count} empty ArchiveResults', + indent_level=2, + pid=self.pid, + ) + + def on_shutdown(self, error: BaseException | None = None) -> None: + """ + Terminate all background Snapshot hooks when snapshot finishes. 
+ + Background hooks should only be killed when: + - All foreground hooks are done (snapshot is sealed) + - Worker is shutting down + """ + # Use shared termination logic from Worker base class + self._terminate_background_hooks( + background_processes=self.background_processes, + worker_type='SnapshotWorker', + indent_level=2, + ) + + super().on_shutdown(error) + + @staticmethod + def _extract_plugin_name(hook_name: str) -> str: + """Extract plugin name from hook filename.""" + # on_Snapshot__50_wget.py -> wget + name = hook_name.split('__')[-1] # Get part after last __ + name = name.replace('.py', '').replace('.js', '').replace('.sh', '') + name = name.replace('.bg', '') # Remove .bg suffix + return name @classmethod - def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int: - """Fork a new worker as subprocess with optional plugin filter.""" + def start(cls, snapshot_id: str, **kwargs: Any) -> int: + """Fork a SnapshotWorker for a specific snapshot.""" from archivebox.machine.models import Process - if worker_id is None: - worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) + worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) - # Use module-level function for pickling compatibility proc = MPProcess( - target=_run_worker, - args=(cls.name, worker_id, daemon), - kwargs={'plugin': plugin, **kwargs}, - name=f'{cls.name}_worker_{worker_id}', + target=_run_snapshot_worker, # New module-level function + args=(snapshot_id, worker_id), + kwargs=kwargs, + name=f'snapshot_worker_{snapshot_id[:8]}', ) proc.start() @@ -481,7 +743,6 @@ class ArchiveResultWorker(Worker): WORKER_TYPES.update({ 'crawl': CrawlWorker, 'snapshot': SnapshotWorker, - 'archiveresult': ArchiveResultWorker, }) diff --git a/old/TODO_Process_cleanup_unification.md b/old/TODO_Process_cleanup_unification.md new file mode 100644 index 00000000..ea12a0a0 --- /dev/null +++ b/old/TODO_Process_cleanup_unification.md @@ -0,0 +1,333 @@ +# Process Model Integration Plan + +## Current Architecture + +### Hook Execution Flow +``` +Orchestrator + ├─> CrawlWorker + │ └─> Crawl.run() [state machine @started.enter] + │ └─> run_hook() for on_Crawl__* hooks + │ └─> subprocess.Popen (NOT using Process model) + │ + └─> SnapshotWorker + └─> Snapshot.run() [planned - doesn't exist yet] + └─> ArchiveResult.run() [state machine @started.enter] + └─> run_hook() for on_Snapshot__* hooks + └─> subprocess.Popen (NOT using Process model) +``` + +### Problem +1. **No Process tracking**: `run_hook()` uses `subprocess.Popen` directly, never creates Process records +2. **Orphaned Process model**: Process model has `.launch()`, `.wait()`, `.terminate()` methods that are NEVER used +3. **Manual process management**: SnapshotWorker manually uses psutil for waiting/killing +4. **Duplicate logic**: Process model and run_hook() both do subprocess management independently + +## Unified Architecture + +### Goal +Make Process model the **single source of truth** for all subprocess operations: +- Hook execution +- PID tracking +- stdout/stderr capture +- Process lifecycle (launch, wait, terminate) + +### Design + +```python +# hooks.py - Thin wrapper +def run_hook(...) -> Process: + """ + Run a hook using Process model (THIN WRAPPER). + + Returns Process model instance for tracking and control. 
+ """ + from archivebox.machine.models import Process + + # Build command + cmd = build_hook_cmd(script, kwargs) + + # Use Process.launch() - handles everything + process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + env=build_hook_env(config), + timeout=timeout, + ) + + # Launch subprocess + process.launch(background=is_background_hook(script.name)) + + return process # Return Process, not dict + + +# worker.py - Use Process methods +class SnapshotWorker: + def _run_hook(self, hook_path, ar) -> Process: + """Fork hook using Process model.""" + process = run_hook( + hook_path, + ar.create_output_dir(), + self.snapshot.config, + url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), + ) + + # Link ArchiveResult to Process + ar.process = process + ar.save() + + return process + + def _wait_for_hook(self, process, ar): + """Wait using Process.wait() method.""" + exit_code = process.wait(timeout=None) + + # Update AR from hook output + ar.update_from_output() + ar.status = ar.StatusChoices.SUCCEEDED if exit_code == 0 else ar.StatusChoices.FAILED + ar.save() + + def on_shutdown(self): + """ + Terminate all background hooks in parallel with per-plugin timeouts. + + Phase 1: Send SIGTERM to all in parallel (polite request to wrap up) + Phase 2: Wait for all in parallel, respecting individual plugin timeouts + Phase 3: SIGKILL any that exceed their timeout + + Each plugin has its own timeout (SCREENSHOT_TIMEOUT=60, YTDLP_TIMEOUT=300, etc.) + Some hooks (consolelog, responses) exit immediately on SIGTERM. + Others (ytdlp, wget) need their full timeout to finish actual work. + """ + # Send SIGTERM to all processes in parallel + for hook_name, process in self.background_processes.items(): + os.kill(process.pid, signal.SIGTERM) + + # Build per-process deadlines based on plugin-specific timeouts + deadlines = { + name: (proc, time.time() + max(0, proc.timeout - (time.time() - proc.started_at.timestamp()))) + for name, proc in self.background_processes.items() + } + + # Poll all processes in parallel - no head-of-line blocking + still_running = set(deadlines.keys()) + while still_running: + time.sleep(0.1) + for name in list(still_running): + proc, deadline = deadlines[name] + if not proc.is_running(): + still_running.remove(name) + elif time.time() >= deadline: + os.kill(proc.pid, signal.SIGKILL) # Timeout exceeded + still_running.remove(name) + + +# models.py - Process becomes active +class Process: + def launch(self, background=False): + """Spawn subprocess and track it.""" + with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err: + proc = subprocess.Popen( + self.cmd, + cwd=self.pwd, + stdout=out, + stderr=err, + env=self._build_env(), + ) + + self.pid = proc.pid + self.started_at = timezone.now() + self.status = self.StatusChoices.RUNNING + self.save() + + if not background: + # Foreground - wait inline + proc.wait() + self.exit_code = proc.returncode + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + + return self + + def wait(self, timeout=None): + """Wait for process to exit, polling DB.""" + while True: + self.refresh_from_db() + if self.status == self.StatusChoices.EXITED: + return self.exit_code + + # Check via psutil if Process died without updating DB + if not self.is_running(): + self._reap() # Update status from OS + return self.exit_code + + time.sleep(0.1) + + def terminate(self, sig=signal.SIGTERM): + """Gracefully terminate: SIGTERM → 
wait → SIGKILL.""" + if not self.is_running(): + return True + + os.kill(self.pid, sig) + + # Wait for graceful shutdown + for _ in range(50): # 5 seconds + if not self.is_running(): + self._reap() + return True + time.sleep(0.1) + + # Escalate to SIGKILL + os.kill(self.pid, signal.SIGKILL) + self._reap() + return True +``` + +## Migration Steps + +### Step 1: Update Process.launch() (DONE - already exists) +Process model already has `.launch()`, `.wait()`, `.terminate()` methods implemented in machine/models.py:1295-1593 + +### Step 2: Refactor run_hook() to use Process.launch() +**File**: `archivebox/hooks.py` + +Change signature from: +```python +def run_hook(...) -> HookResult: # Returns dict +``` + +To: +```python +def run_hook(...) -> Process: # Returns Process model +``` + +**Implementation**: +```python +def run_hook(script, output_dir, config, timeout=None, **kwargs) -> Process: + from archivebox.machine.models import Process, Machine + + # Build command + cmd = build_hook_cmd(script, kwargs) + env = build_hook_env(config) + is_bg = is_background_hook(script.name) + + # Create Process record + process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=cmd, + env=env, + timeout=timeout or 120, + ) + + # Launch subprocess + process.launch(background=is_bg) + + return process +``` + +### Step 3: Update SnapshotWorker to use Process methods +**File**: `archivebox/workers/worker.py` + +Replace manual psutil code with Process model methods (shown above in Design section). + +### Step 4: Update ArchiveResult.run() to use new run_hook() +**File**: `archivebox/core/models.py:2559` + +Change from: +```python +result = run_hook(...) # Returns HookResult dict +if result is None: + is_bg_hook = True +``` + +To: +```python +process = run_hook(...) # Returns Process +self.process = process +self.save() + +if process.status == Process.StatusChoices.RUNNING: + # Background hook - still running + return +else: + # Foreground hook - completed + self.update_from_output() +``` + +### Step 5: Update Crawl.run() similarly +**File**: `archivebox/crawls/models.py:374` + +Same pattern as ArchiveResult.run() + +## Benefits + +### 1. Single Source of Truth +- Process model owns ALL subprocess operations +- No duplicate logic between run_hook(), Process, and workers +- Consistent PID tracking, stdout/stderr handling + +### 2. Proper Hierarchy +``` +Process.parent_id creates tree: +Orchestrator (PID 1000) + └─> CrawlWorker (PID 1001, parent=1000) + └─> on_Crawl__01_chrome.js (PID 1010, parent=1001) + └─> SnapshotWorker (PID 1020, parent=1000) + └─> on_Snapshot__50_wget.py (PID 1021, parent=1020) + └─> on_Snapshot__63_ytdlp.bg.py (PID 1022, parent=1020) +``` + +### 3. Better Observability +- Query all hook processes: `snapshot.process_set.all()` +- Count running: `Process.objects.filter(status='running').count()` +- Track resource usage via Process.get_memory_info() + +### 4. Cleaner Code +- SnapshotWorker._wait_for_hook: 25 lines → 8 lines +- SnapshotWorker.on_shutdown: 12 lines → 7 lines +- run_hook(): ~200 lines → ~50 lines +- Total: ~100 LoC saved + +## Risks & Mitigation + +### Risk 1: Breaking existing run_hook() callers +**Mitigation**: Two-phase rollout +1. Phase 1: Add run_hook_v2() that returns Process +2. Phase 2: Migrate callers to run_hook_v2() +3. 
Phase 3: Rename run_hook → run_hook_legacy, run_hook_v2 → run_hook + +### Risk 2: Background hook tracking changes +**Mitigation**: +- Process.launch(background=True) handles async launches +- Process.wait() already polls for completion +- Behavior identical to current subprocess.Popen + +### Risk 3: Performance overhead (extra DB writes) +**Mitigation**: +- Process records already being created (just not used) +- Batch updates where possible +- Monitor via metrics + +## Timeline + +### Immediate (This PR) +- [x] State machine fixes (completed) +- [x] Step advancement optimization (completed) +- [x] Document unified architecture (this file) + +### Next PR (Process Integration) +1. Add run_hook_v2() returning Process +2. Update SnapshotWorker to use Process methods +3. Migrate ArchiveResult.run() and Crawl.run() +4. Deprecate old run_hook() + +### Future +- Remove run_hook_legacy after migration complete +- Add Process.get_tree() for hierarchy visualization +- Add ProcessMachine state machine for lifecycle management diff --git a/TODO_fix_migration_path.md b/old/TODO_fix_migration_path.md similarity index 100% rename from TODO_fix_migration_path.md rename to old/TODO_fix_migration_path.md
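
The "Proper Hierarchy" and "Future" sections of the TODO above describe walking the Process tree via `parent_id` (a planned `Process.get_tree()` for hierarchy visualization). A minimal sketch of what that traversal could look like, assuming only the `parent_id`, `pid`, `process_type`, `status`, and `started_at` fields already referenced in this patch; the function name, signature, and output format are hypothetical and not part of the patch itself:

```python
# Hypothetical sketch of the Process.get_tree() idea listed under "Future".
# Assumes only the parent_id FK and fields used elsewhere in this patch;
# names and return shape are illustrative, not an implemented API.
def get_tree(process, depth=0):
    """Yield (depth, process) pairs for this process and all of its descendants."""
    yield depth, process
    # Children are Process rows whose parent_id points at this process
    children = process.__class__.objects.filter(parent_id=process.id).order_by('started_at')
    for child in children:
        yield from get_tree(child, depth + 1)

# Example usage (debugging): print Orchestrator -> CrawlWorker -> hook hierarchy
# for depth, proc in get_tree(orchestrator_process):
#     print('  ' * depth + f'{proc.process_type} pid={proc.pid} status={proc.status}')
```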