mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
unified Process source of truth and better screenshot tests
This commit is contained in:
@@ -1,3 +0,0 @@
|
||||
[SERVER_CONFIG]
|
||||
SECRET_KEY = y6fw9wcaqls9sx_dze6ahky9ggpkpzoaw5g5v98_u3ro5j0_4f
|
||||
|
||||
@@ -35,12 +35,8 @@ class WorkerSchema(Schema):
|
||||
model: str
|
||||
max_tick_time: int
|
||||
max_concurrent_tasks: int
|
||||
poll_interval: float
|
||||
idle_timeout: int
|
||||
running_count: int
|
||||
running_workers: List[dict[str, Any]]
|
||||
queue_count: int
|
||||
queue: List[QueueItemSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_model(obj) -> str:
|
||||
@@ -55,38 +51,21 @@ class WorkerSchema(Schema):
|
||||
def resolve_max_concurrent_tasks(obj) -> int:
|
||||
return obj.MAX_CONCURRENT_TASKS
|
||||
|
||||
@staticmethod
|
||||
def resolve_poll_interval(obj) -> float:
|
||||
return obj.POLL_INTERVAL
|
||||
|
||||
@staticmethod
|
||||
def resolve_idle_timeout(obj) -> int:
|
||||
return obj.IDLE_TIMEOUT
|
||||
|
||||
@staticmethod
|
||||
def resolve_running_count(obj) -> int:
|
||||
return len(obj.get_running_workers())
|
||||
return obj.get_worker_count()
|
||||
|
||||
@staticmethod
|
||||
def resolve_running_workers(obj) -> List[dict[str, Any]]:
|
||||
return obj.get_running_workers()
|
||||
|
||||
@staticmethod
|
||||
def resolve_queue_count(obj) -> int:
|
||||
return obj.get_queue().count()
|
||||
|
||||
@staticmethod
|
||||
def resolve_queue(obj) -> List[QueueItemSchema]:
|
||||
return list(obj.get_queue()[:50]) # Limit to 50 items
|
||||
|
||||
|
||||
class OrchestratorSchema(Schema):
|
||||
"""Schema for the Orchestrator."""
|
||||
is_running: bool
|
||||
poll_interval: float
|
||||
idle_timeout: int
|
||||
max_workers_per_type: int
|
||||
max_total_workers: int
|
||||
max_crawl_workers: int
|
||||
total_worker_count: int
|
||||
workers: List[WorkerSchema]
|
||||
|
||||
@@ -95,23 +74,20 @@ class OrchestratorSchema(Schema):
|
||||
def get_orchestrator(request):
|
||||
"""Get the orchestrator status and all worker queues."""
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
# Create temporary worker instances to query their queues
|
||||
workers = [
|
||||
CrawlWorker(worker_id=-1),
|
||||
SnapshotWorker(worker_id=-1),
|
||||
ArchiveResultWorker(worker_id=-1),
|
||||
]
|
||||
|
||||
return {
|
||||
'is_running': orchestrator.is_running(),
|
||||
'poll_interval': orchestrator.POLL_INTERVAL,
|
||||
'idle_timeout': orchestrator.IDLE_TIMEOUT,
|
||||
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
|
||||
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
|
||||
'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
|
||||
'total_worker_count': orchestrator.get_total_worker_count(),
|
||||
'workers': workers,
|
||||
}
|
||||
@@ -120,41 +96,12 @@ def get_orchestrator(request):
|
||||
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
|
||||
def get_workers(request):
|
||||
"""List all worker types and their current status."""
|
||||
from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
|
||||
# Create temporary instances to query their queues
|
||||
return [
|
||||
CrawlWorker(worker_id=-1),
|
||||
SnapshotWorker(worker_id=-1),
|
||||
ArchiveResultWorker(worker_id=-1),
|
||||
]
|
||||
|
||||
|
||||
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
|
||||
def get_worker(request, worker_name: str):
|
||||
"""Get status and queue for a specific worker type."""
|
||||
from archivebox.workers.worker import WORKER_TYPES
|
||||
|
||||
if worker_name not in WORKER_TYPES:
|
||||
from ninja.errors import HttpError
|
||||
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||
|
||||
WorkerClass = WORKER_TYPES[worker_name]
|
||||
return WorkerClass(worker_id=-1)
|
||||
|
||||
|
||||
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
|
||||
def get_worker_queue(request, worker_name: str, limit: int = 100):
|
||||
"""Get the current queue for a specific worker type."""
|
||||
from archivebox.workers.worker import WORKER_TYPES
|
||||
|
||||
if worker_name not in WORKER_TYPES:
|
||||
from ninja.errors import HttpError
|
||||
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||
|
||||
WorkerClass = WORKER_TYPES[worker_name]
|
||||
worker = WorkerClass(worker_id=-1)
|
||||
return list(worker.get_queue()[:limit])
|
||||
|
||||
|
||||
# Progress endpoint moved to core.views.live_progress_view for simplicity
|
||||
|
||||
@@ -96,10 +96,9 @@ def add(urls: str | list[str],
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
|
||||
print(f' [dim]First URL: {first_url}[/dim]')
|
||||
|
||||
# 3. The CrawlMachine will create the root Snapshot when started
|
||||
# If URLs are from a file: first URL = file:///path/to/sources/...txt
|
||||
# Parser extractors will run on it and discover more URLs
|
||||
# Those URLs become child Snapshots (depth=1)
|
||||
# 3. The CrawlMachine will create Snapshots from all URLs when started
|
||||
# Parser extractors run on snapshots and discover more URLs
|
||||
# Discovered URLs become child Snapshots (depth+1)
|
||||
|
||||
if index_only:
|
||||
# Just create the crawl but don't start processing
|
||||
@@ -119,10 +118,9 @@ def add(urls: str | list[str],
|
||||
|
||||
# 5. Start the orchestrator to process the queue
|
||||
# The orchestrator will:
|
||||
# - Process Crawl -> create root Snapshot
|
||||
# - Process root Snapshot -> run parser extractors -> discover URLs
|
||||
# - Create child Snapshots from discovered URLs
|
||||
# - Process child Snapshots -> run extractors
|
||||
# - Process Crawl -> create Snapshots from all URLs
|
||||
# - Process Snapshots -> run extractors
|
||||
# - Parser extractors discover new URLs -> create child Snapshots
|
||||
# - Repeat until max_depth reached
|
||||
|
||||
if bg:
|
||||
|
||||
@@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if install:
|
||||
from archivebox.cli.archivebox_install import install as install_method
|
||||
|
||||
@@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ tick() unless is_finished() │
|
||||
│ │ result.run()│─────────────────┘ │
|
||||
│ │ (execute │ │
|
||||
│ │ hook via │ │
|
||||
│ │ run_hook())│ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ │
|
||||
│ │ tick() checks status set by hook output │
|
||||
│ ├────────────────┬────────────────┬────────────────┐ │
|
||||
│ ▼ ▼ ▼ ▼ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │
|
||||
│ │ (final) │ │ (final) │ │ (final) │ │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ │
|
||||
│ │ │
|
||||
│ can_start()───┘ │
|
||||
│ loops back to STARTED │
|
||||
│ │ QUEUED │◄─────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──┬───────┬──┘ │ │
|
||||
│ │ │ │ tick() unless can_start() │
|
||||
│ │ │ exceeded_max_ │ │
|
||||
│ │ │ attempts │ │
|
||||
│ │ ▼ │ │
|
||||
│ │ ┌──────────┐ │ │
|
||||
│ │ │ SKIPPED │ │ │
|
||||
│ │ │ (final) │ │ │
|
||||
│ │ └──────────┘ │ │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │──────────────────┘ │
|
||||
│ │ │◄─────────────────────────────────────────────────┐ │
|
||||
│ │ enter: │ │ │ │
|
||||
│ │ result.run()│ tick() unless │ │ │
|
||||
│ │ (execute │ is_finished() │ │ │
|
||||
│ │ hook via │──────────────────────┘ │ │
|
||||
│ │ run_hook())│ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ tick() checks status set by hook output │ │
|
||||
│ ├─────────────┬─────────────┬─────────────┐ │ │
|
||||
│ ▼ ▼ ▼ ▼ │ │
|
||||
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
|
||||
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
|
||||
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
|
||||
│ │ │ │ │
|
||||
│ exceeded_max_ │ │ can_start()│ │
|
||||
│ attempts │ │ loops back │ │
|
||||
│ ▼ │ └────────────┘ │
|
||||
│ ┌──────────┐ │ │
|
||||
│ │ SKIPPED │◄─┘ │
|
||||
│ │ (final) │ │
|
||||
│ └──────────┘ │
|
||||
│ │
|
||||
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
@@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """
|
||||
│ │ QUEUED │◄────────────────┐ │
|
||||
│ │ (initial) │ │ │
|
||||
│ └──────┬──────┘ │ │
|
||||
│ │ │ tick() unless can_start() │
|
||||
│ │ │ tick() unless can_install() │
|
||||
│ │ │ (stays queued if failed) │
|
||||
│ │ tick() when │ │
|
||||
│ │ can_start() │ │
|
||||
│ ▼ │ │
|
||||
│ ┌─────────────┐ │ │
|
||||
│ │ STARTED │─────────────────┘ │
|
||||
│ │ │◄────────────────┐ │
|
||||
│ │ enter: │ │ │
|
||||
│ │ binary.run()│ │ tick() unless is_finished() │
|
||||
│ │ (discover │─────────────────┘ │
|
||||
│ │ Binary │ │
|
||||
│ │ hooks, │ │
|
||||
│ │ try each │ │
|
||||
│ │ provider) │ │
|
||||
│ └──────┬──────┘ │
|
||||
│ │ can_install() │ │
|
||||
│ │ │ │
|
||||
│ │ on_install() runs │ │
|
||||
│ │ during transition: │ │
|
||||
│ │ • binary.run() │ │
|
||||
│ │ (discover Binary │ │
|
||||
│ │ hooks, try each │ │
|
||||
│ │ provider until │ │
|
||||
│ │ one succeeds) │ │
|
||||
│ │ • Sets abspath, │ │
|
||||
│ │ version, sha256 │ │
|
||||
│ │ │ │
|
||||
│ │ If install fails: │ │
|
||||
│ │ raises exception──────┘ │
|
||||
│ │ (retry_at bumped) │
|
||||
│ │ │
|
||||
│ │ tick() checks status set by hook output │
|
||||
│ ├────────────────────────────────┐ │
|
||||
│ ▼ ▼ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ SUCCEEDED │ │ FAILED │ │
|
||||
│ │ (final) │ │ (final) │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ abspath, │ │ no provider │ │
|
||||
│ │ version set │ │ succeeded │ │
|
||||
│ └─────────────┘ └─────────────┘ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────┐ │
|
||||
│ │ INSTALLED │ │
|
||||
│ │ (final) │ │
|
||||
│ │ │ │
|
||||
│ │ Binary is │ │
|
||||
│ │ ready to │ │
|
||||
│ │ use │ │
|
||||
│ └─────────────┘ │
|
||||
│ │
|
||||
│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter) │
|
||||
│ Hooks triggered: on_Binary__* (provider hooks during transition) │
|
||||
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
|
||||
│ Installation is synchronous - no intermediate STARTED state │
|
||||
└─────────────────────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
|
||||
@@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
|
||||
if not archive_dir.exists():
|
||||
return stats
|
||||
|
||||
print('[*] Scanning for old directories in archive/...')
|
||||
print('[DEBUG Phase1] Scanning for old directories in archive/...')
|
||||
|
||||
# Scan for real directories only (skip symlinks - they're already migrated)
|
||||
all_entries = list(os.scandir(archive_dir))
|
||||
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
|
||||
entries = [
|
||||
(e.stat().st_mtime, e.path)
|
||||
for e in os.scandir(archive_dir)
|
||||
for e in all_entries
|
||||
if e.is_dir(follow_symlinks=False) # Skip symlinks
|
||||
]
|
||||
entries.sort(reverse=True) # Newest first
|
||||
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
|
||||
print(f'[*] Found {len(entries)} old directories to drain')
|
||||
|
||||
for mtime, entry_path in entries:
|
||||
@@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
|
||||
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
|
||||
continue
|
||||
|
||||
# Ensure snapshot has a valid crawl (migration 0024 may have failed)
|
||||
from archivebox.crawls.models import Crawl
|
||||
has_valid_crawl = False
|
||||
if snapshot.crawl_id:
|
||||
# Check if the crawl actually exists
|
||||
has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists()
|
||||
|
||||
if not has_valid_crawl:
|
||||
# Create a new crawl (created_by will default to system user)
|
||||
crawl = Crawl.objects.create(urls=snapshot.url)
|
||||
# Use queryset update to avoid triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
|
||||
# Refresh the instance
|
||||
snapshot.crawl = crawl
|
||||
snapshot.crawl_id = crawl.id
|
||||
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
|
||||
|
||||
# Check if needs migration (0.8.x → 0.9.x)
|
||||
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
if snapshot.fs_migration_needed:
|
||||
try:
|
||||
# Manually trigger filesystem migration without full save()
|
||||
# This avoids UNIQUE constraint issues while still migrating files
|
||||
cleanup_info = None
|
||||
if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
|
||||
cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
|
||||
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
|
||||
# because snapshot.timestamp might be truncated
|
||||
old_dir = entry_path
|
||||
new_dir = snapshot.get_storage_path_for_version('0.9.0')
|
||||
print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")
|
||||
|
||||
# Manually migrate files
|
||||
if not new_dir.exists() and old_dir.exists():
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
import shutil
|
||||
file_count = 0
|
||||
for old_file in old_dir.rglob('*'):
|
||||
if old_file.is_file():
|
||||
rel_path = old_file.relative_to(old_dir)
|
||||
new_file = new_dir / rel_path
|
||||
if not new_file.exists():
|
||||
new_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(old_file, new_file)
|
||||
file_count += 1
|
||||
print(f"[DEBUG Phase1] Copied {file_count} files")
|
||||
|
||||
# Update only fs_version field using queryset update (bypasses validation)
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
@@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
|
||||
# Commit the transaction
|
||||
transaction.commit()
|
||||
|
||||
# Manually call cleanup since we bypassed normal save() flow
|
||||
if cleanup_info:
|
||||
old_dir, new_dir = cleanup_info
|
||||
# Cleanup: delete old dir and create symlink
|
||||
if old_dir.exists() and old_dir != new_dir:
|
||||
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
|
||||
|
||||
stats['migrated'] += 1
|
||||
@@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Reconcile index.json with DB
|
||||
snapshot.reconcile_with_index_json()
|
||||
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
|
||||
# Check if snapshot has a directory on disk
|
||||
from pathlib import Path
|
||||
output_dir = Path(snapshot.output_dir)
|
||||
has_directory = output_dir.exists() and output_dir.is_dir()
|
||||
|
||||
# Only reconcile if directory exists (don't create empty directories for orphans)
|
||||
if has_directory:
|
||||
snapshot.reconcile_with_index_json()
|
||||
|
||||
# Clean up invalid field values from old migrations
|
||||
if not isinstance(snapshot.current_step, int):
|
||||
snapshot.current_step = 0
|
||||
|
||||
# If still needs migration, it's an orphan (no directory on disk)
|
||||
# Mark it as migrated to prevent save() from triggering filesystem migration
|
||||
if snapshot.fs_migration_needed:
|
||||
if has_directory:
|
||||
print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
|
||||
else:
|
||||
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
|
||||
# Use queryset update to set fs_version without triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
snapshot.fs_version = '0.9.0'
|
||||
|
||||
# Queue for archiving (state machine will handle it)
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1
|
||||
stats['reconciled'] += 1 if has_directory else 0
|
||||
stats['queued'] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed (e.g., missing crawl)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
from django.db import migrations, connection
|
||||
import json
|
||||
from pathlib import Path
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
|
||||
def parse_cmd_field(cmd_raw):
|
||||
@@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw):
|
||||
|
||||
def get_or_create_current_machine(cursor):
|
||||
"""Get or create Machine.current() using raw SQL."""
|
||||
import uuid
|
||||
import socket
|
||||
from datetime import datetime
|
||||
|
||||
@@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor):
|
||||
return row[0]
|
||||
|
||||
# Create new machine
|
||||
machine_id = str(uuid.uuid4())
|
||||
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
|
||||
machine_id = uuid7().hex
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
|
||||
@@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
Returns:
|
||||
binary_id (str)
|
||||
"""
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# If abspath is just a name without slashes, it's not a full path
|
||||
@@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
return row[0]
|
||||
|
||||
# Create new binary
|
||||
binary_id = str(uuid.uuid4())
|
||||
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
|
||||
binary_id = uuid7().hex
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
|
||||
@@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
|
||||
Returns:
|
||||
process_id (str)
|
||||
"""
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
process_id = str(uuid.uuid4())
|
||||
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
|
||||
process_id = uuid7().hex
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Convert cmd array to JSON
|
||||
|
||||
18
archivebox/core/migrations/0028_alter_snapshot_fs_version.py
Normal file
18
archivebox/core/migrations/0028_alter_snapshot_fs_version.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 6.0 on 2026-01-02 08:43
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0027_copy_archiveresult_to_process'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,181 @@
|
||||
# Generated by hand on 2026-01-02
|
||||
# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot)
|
||||
|
||||
from django.db import migrations, models, connection
|
||||
from uuid import UUID
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
|
||||
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
"""
|
||||
Migrate ArchiveResult from integer PK to UUID PK.
|
||||
|
||||
Strategy:
|
||||
1. Add old_id field to store current integer IDs
|
||||
2. Generate UUIDs for any records missing them
|
||||
3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
|
||||
"""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if table exists and has data
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
|
||||
if not cursor.fetchone():
|
||||
print('ArchiveResult table does not exist, skipping migration')
|
||||
return
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
||||
row_count = cursor.fetchone()[0]
|
||||
|
||||
if row_count == 0:
|
||||
print('No ArchiveResult records to migrate')
|
||||
return
|
||||
|
||||
print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
|
||||
|
||||
# Step 0: Check if machine_process table exists, if not NULL out process_id values
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
|
||||
machine_process_exists = cursor.fetchone() is not None
|
||||
|
||||
if not machine_process_exists:
|
||||
print('machine_process table does not exist yet, setting process_id to NULL')
|
||||
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
|
||||
|
||||
# Step 1: Create new table with UUID as primary key
|
||||
cursor.execute("""
|
||||
CREATE TABLE core_archiveresult_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
old_id INTEGER,
|
||||
uuid TEXT UNIQUE,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL,
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_json TEXT,
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_size BIGINT NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
process_id TEXT,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL
|
||||
);
|
||||
""")
|
||||
|
||||
# Step 2: Generate UUIDs for records that don't have them
|
||||
cursor.execute("SELECT id, uuid FROM core_archiveresult")
|
||||
records = cursor.fetchall()
|
||||
|
||||
id_to_uuid = {}
|
||||
for old_id, existing_uuid in records:
|
||||
if existing_uuid:
|
||||
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
|
||||
# (existing UUIDs might be stored with or without dashes in old schema)
|
||||
id_to_uuid[old_id] = UUID(existing_uuid).hex
|
||||
else:
|
||||
# Generate new UUIDv7 (time-ordered) as 32-char hex
|
||||
id_to_uuid[old_id] = uuid7().hex
|
||||
|
||||
# Step 3: Copy data with UUIDs as new primary key
|
||||
cursor.execute("SELECT * FROM core_archiveresult")
|
||||
old_records = cursor.fetchall()
|
||||
|
||||
# Get column names
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
columns = cursor.fetchall()
|
||||
col_names = [col[1] for col in columns]
|
||||
|
||||
for i, record in enumerate(old_records):
|
||||
old_id = record[col_names.index('id')]
|
||||
new_uuid = id_to_uuid[old_id]
|
||||
|
||||
# Build insert with new structure
|
||||
values = {col_names[i]: record[i] for i in range(len(col_names))}
|
||||
|
||||
# Check which fields exist in new table
|
||||
fields_to_copy = [
|
||||
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
|
||||
'status', 'retry_at', 'start_ts', 'end_ts',
|
||||
'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes',
|
||||
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
|
||||
]
|
||||
|
||||
# Build INSERT statement
|
||||
existing_fields = [f for f in fields_to_copy if f in values]
|
||||
placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
|
||||
field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
|
||||
|
||||
insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
|
||||
|
||||
cursor.execute(
|
||||
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
|
||||
insert_values
|
||||
)
|
||||
|
||||
# Step 4: Replace old table with new table
|
||||
cursor.execute("DROP TABLE core_archiveresult")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
|
||||
|
||||
# Step 5: Create indexes
|
||||
cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
|
||||
|
||||
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0028_alter_snapshot_fs_version'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunPython(
|
||||
migrate_archiveresult_id_to_uuid,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Remove old uuid field
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
),
|
||||
# Change id from AutoField to UUIDField
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
|
||||
),
|
||||
# Add old_id field to preserve legacy integer IDs
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='old_id',
|
||||
field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
19
archivebox/core/migrations/0030_alter_archiveresult_id.py
Normal file
19
archivebox/core/migrations/0030_alter_archiveresult_id.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Generated by Django 6.0 on 2026-01-02 10:02
|
||||
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0029_migrate_archiveresult_to_uuid_pk'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
]
|
||||
@@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Migrate filesystem if needed (happens automatically on save)
|
||||
if self.pk and self.fs_migration_needed:
|
||||
print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version} → {self._fs_current_version()}")
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
@@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
print(f"[DEBUG save()] Running {method}()")
|
||||
getattr(self, method)()
|
||||
|
||||
current = next_ver
|
||||
@@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
old_dir = self.get_storage_path_for_version('0.8.0')
|
||||
new_dir = self.get_storage_path_for_version('0.9.0')
|
||||
|
||||
if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
|
||||
# Even if no directory migration needed, still convert index format
|
||||
self.convert_index_json_to_jsonl()
|
||||
return
|
||||
print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}")
|
||||
|
||||
if not old_dir.exists() or old_dir == new_dir:
|
||||
# No migration needed
|
||||
print(f"[DEBUG _fs_migrate] Returning None (early return)")
|
||||
return None
|
||||
|
||||
if new_dir.exists():
|
||||
# New directory already exists (files already copied), but we still need cleanup
|
||||
# Return cleanup info so old directory can be cleaned up
|
||||
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
|
||||
return (old_dir, new_dir)
|
||||
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
|
||||
"""
|
||||
Delete old directory and create symlink after successful migration.
|
||||
Called via transaction.on_commit() after DB commit succeeds.
|
||||
"""
|
||||
import shutil
|
||||
import logging
|
||||
|
||||
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
|
||||
|
||||
# Delete old directory
|
||||
if old_dir.exists() and not old_dir.is_symlink():
|
||||
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
|
||||
try:
|
||||
shutil.rmtree(old_dir)
|
||||
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
|
||||
except Exception as e:
|
||||
# Log but don't raise - migration succeeded, this is just cleanup
|
||||
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not remove old migration directory {old_dir}: {e}"
|
||||
)
|
||||
return # Don't create symlink if cleanup failed
|
||||
else:
|
||||
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
|
||||
|
||||
# Create backwards-compat symlink (after old dir is deleted)
|
||||
symlink_path = old_dir # Same path as old_dir
|
||||
if symlink_path.is_symlink():
|
||||
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
|
||||
symlink_path.unlink()
|
||||
|
||||
if not symlink_path.exists():
|
||||
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
|
||||
try:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
print(f"[DEBUG] Successfully created symlink")
|
||||
except Exception as e:
|
||||
print(f"[DEBUG] Failed to create symlink: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
|
||||
)
|
||||
else:
|
||||
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
|
||||
|
||||
# =========================================================================
|
||||
# Path Calculation and Migration Helpers
|
||||
@@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if not timestamp:
|
||||
return None
|
||||
|
||||
# Look up existing
|
||||
# Look up existing (try exact match first, then fuzzy match for truncated timestamps)
|
||||
try:
|
||||
return cls.objects.get(url=url, timestamp=timestamp)
|
||||
snapshot = cls.objects.get(url=url, timestamp=timestamp)
|
||||
print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
|
||||
return snapshot
|
||||
except cls.DoesNotExist:
|
||||
print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
|
||||
# Try fuzzy match - index.json may have truncated timestamp
|
||||
# e.g., index has "1767000340" but DB has "1767000340.624737"
|
||||
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
|
||||
if candidates.count() == 1:
|
||||
snapshot = candidates.first()
|
||||
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
|
||||
return snapshot
|
||||
elif candidates.count() > 1:
|
||||
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
|
||||
return candidates.first()
|
||||
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
|
||||
return None
|
||||
except cls.MultipleObjectsReturned:
|
||||
# Should not happen with unique constraint
|
||||
print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
|
||||
return cls.objects.filter(url=url, timestamp=timestamp).first()
|
||||
|
||||
@classmethod
|
||||
@@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
return archiveresults
|
||||
|
||||
def advance_step_if_ready(self) -> bool:
|
||||
"""
|
||||
Advance current_step if all foreground hooks in current step are finished.
|
||||
|
||||
Called by the state machine to check if step can advance.
|
||||
Background hooks (.bg) don't block step advancement.
|
||||
|
||||
Step advancement rules:
|
||||
- All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
|
||||
- Background ARs (hook_name contains '.bg.') are ignored for advancement
|
||||
- When ready, increments current_step by 1 (up to 9)
|
||||
|
||||
Returns:
|
||||
True if step was advanced, False if not ready or already at step 9.
|
||||
"""
|
||||
from archivebox.hooks import extract_step, is_background_hook
|
||||
|
||||
if self.current_step >= 9:
|
||||
return False # Already at final step
|
||||
|
||||
# Get all ARs for current step that are foreground
|
||||
current_step_ars = self.archiveresult_set.filter(
|
||||
hook_name__isnull=False
|
||||
).exclude(hook_name='')
|
||||
|
||||
# Check each AR in current step
|
||||
for ar in current_step_ars:
|
||||
ar_step = extract_step(ar.hook_name)
|
||||
if ar_step != self.current_step:
|
||||
continue # Not in current step
|
||||
|
||||
if is_background_hook(ar.hook_name):
|
||||
continue # Background hooks don't block
|
||||
|
||||
# Foreground hook in current step - check if finished
|
||||
if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
|
||||
# Still pending/queued - can't advance
|
||||
return False
|
||||
|
||||
if ar.status == ArchiveResult.StatusChoices.STARTED:
|
||||
# Still running - can't advance
|
||||
return False
|
||||
|
||||
# All foreground hooks in current step are finished - advance!
|
||||
self.current_step += 1
|
||||
self.save(update_fields=['current_step', 'modified_at'])
|
||||
return True
|
||||
|
||||
def is_finished_processing(self) -> bool:
|
||||
"""
|
||||
Check if this snapshot has finished processing.
|
||||
Check if all ArchiveResults are finished.
|
||||
|
||||
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
|
||||
|
||||
Returns:
|
||||
True if all archiveresults are finished (or no work to do), False otherwise.
|
||||
Note: This is only called for observability/progress tracking.
|
||||
SnapshotWorker owns the execution and doesn't poll this.
|
||||
"""
|
||||
# if no archiveresults exist yet, it's not finished
|
||||
if not self.archiveresult_set.exists():
|
||||
return False
|
||||
# Check if any ARs are still pending/started
|
||||
pending = self.archiveresult_set.exclude(
|
||||
status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
|
||||
).exists()
|
||||
|
||||
# Try to advance step if ready (handles step-based hook execution)
|
||||
# This will increment current_step when all foreground hooks in current step are done
|
||||
while self.advance_step_if_ready():
|
||||
pass # Keep advancing until we can't anymore
|
||||
|
||||
# if archiveresults exist but are still pending, it's not finished
|
||||
if self.pending_archiveresults().exists():
|
||||
return False
|
||||
|
||||
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
|
||||
# Background hooks in STARTED state are excluded by pending_archiveresults()
|
||||
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
|
||||
# we can transition to sealed and cleanup() will kill the background hooks
|
||||
|
||||
# otherwise archiveresults exist and are all finished, so it's finished
|
||||
return True
|
||||
return not pending
|
||||
|
||||
def get_progress_stats(self) -> dict:
|
||||
"""
|
||||
@@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
@@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if all ArchiveResults for this snapshot are finished."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
@@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
import sys
|
||||
|
||||
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# Check if any archiveresults were created
|
||||
ar_count = self.snapshot.archiveresult_set.count()
|
||||
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
|
||||
|
||||
if ar_count == 0:
|
||||
# No archiveresults created, seal immediately
|
||||
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
|
||||
self.seal()
|
||||
else:
|
||||
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
|
||||
# Last AR will manually call self.seal() when done
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(days=365),
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
|
||||
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
|
||||
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
self.snapshot.retry_at = None # No more polling
|
||||
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
@@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
plugins = [get_plugin_name(e) for e in get_plugins()]
|
||||
return tuple((e, e) for e in plugins)
|
||||
|
||||
# Keep AutoField for backward compatibility with 0.7.x databases
|
||||
# UUID field is added separately by migration for new records
|
||||
id = models.AutoField(primary_key=True, editable=False)
|
||||
# Note: unique constraint is added by migration 0027 - don't set unique=True here
|
||||
# or SQLite table recreation in earlier migrations will fail
|
||||
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
|
||||
# UUID primary key (migrated from integer in 0029)
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
# old_id preserves the legacy integer ID for backward compatibility
|
||||
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
|
||||
# Note: uuid field was removed in migration 0029 when id became UUID
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
@@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
|
||||
|
||||
start_ts = timezone.now()
|
||||
is_bg_hook = False
|
||||
process = None
|
||||
|
||||
for hook in hooks:
|
||||
# Check if this is a background hook
|
||||
is_bg_hook = is_background_hook(hook.name)
|
||||
|
||||
result = run_hook(
|
||||
# Run hook using Process.launch() - returns Process model
|
||||
process = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_dir,
|
||||
config=config,
|
||||
@@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
depth=self.snapshot.depth,
|
||||
)
|
||||
|
||||
# Background hooks return None
|
||||
if result is None:
|
||||
is_bg_hook = True
|
||||
|
||||
# Update status based on hook execution
|
||||
if is_bg_hook:
|
||||
# BACKGROUND HOOK - still running, return immediately
|
||||
# Status stays STARTED, will be finalized by Snapshot.cleanup()
|
||||
self.status = self.StatusChoices.STARTED
|
||||
# Link ArchiveResult to Process
|
||||
self.process = process
|
||||
self.start_ts = start_ts
|
||||
if self.process_id:
|
||||
self.process.pwd = str(plugin_dir)
|
||||
self.process.save()
|
||||
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
|
||||
|
||||
if not process:
|
||||
# No hooks ran
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output_str = 'No hooks executed'
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Update status based on hook execution
|
||||
if process.status == process.StatusChoices.RUNNING:
|
||||
# BACKGROUND HOOK - still running, return immediately
|
||||
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
|
||||
return
|
||||
|
||||
# FOREGROUND HOOK - completed, update from filesystem
|
||||
self.start_ts = start_ts
|
||||
if self.process_id:
|
||||
self.process.pwd = str(plugin_dir)
|
||||
self.process.save()
|
||||
self.update_from_output()
|
||||
|
||||
# Clean up empty output directory if no files were created
|
||||
@@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
# Flow: queued → started → (succeeded|failed|skipped)
|
||||
# queued → skipped (if exceeded max attempts)
|
||||
# started → backoff → started (retry)
|
||||
tick = (
|
||||
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(skipped, cond='is_skipped') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
|
||||
backoff.to.itself(unless='can_start') |
|
||||
backoff.to(started, cond='can_start') |
|
||||
backoff.to(succeeded, cond='is_succeeded') |
|
||||
backoff.to(failed, cond='is_failed') |
|
||||
backoff.to(skipped, cond='is_skipped')
|
||||
backoff.to(started, cond='can_start')
|
||||
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
|
||||
# Reason: backoff should always retry→started, then started→final states
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
if not self.archiveresult.snapshot.url:
|
||||
return False
|
||||
"""Pure function - check if AR can start (has valid URL)."""
|
||||
return bool(self.archiveresult.snapshot.url)
|
||||
|
||||
# Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
|
||||
def is_exceeded_max_attempts(self) -> bool:
|
||||
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config(
|
||||
@@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
status=ArchiveResult.StatusChoices.FAILED
|
||||
).count()
|
||||
|
||||
if failed_count >= max_attempts:
|
||||
# Mark this result as skipped since we've hit the limit
|
||||
self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
|
||||
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
|
||||
self.archiveresult.retry_at = None
|
||||
self.archiveresult.save()
|
||||
return False
|
||||
|
||||
return True
|
||||
return failed_count >= max_attempts
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
@@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if extraction has completed (success, failure, or skipped)."""
|
||||
return self.archiveresult.status in (
|
||||
"""
|
||||
Check if extraction has completed (success, failure, or skipped).
|
||||
|
||||
For background hooks in STARTED state, checks if their Process has finished and reaps them.
|
||||
"""
|
||||
# If already in final state, return True
|
||||
if self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
):
|
||||
return True
|
||||
|
||||
# If in STARTED state with a Process, check if Process has finished running
|
||||
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
|
||||
if self.archiveresult.process_id:
|
||||
process = self.archiveresult.process
|
||||
|
||||
# If process is NOT running anymore, reap the background hook
|
||||
if not process.is_running():
|
||||
self.archiveresult.update_from_output()
|
||||
# Check if now in final state after reaping
|
||||
return self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
|
||||
return False
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
@@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
)
|
||||
|
||||
def _check_and_seal_parent_snapshot(self):
|
||||
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
|
||||
"""
|
||||
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
|
||||
|
||||
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
|
||||
This method is kept for backwards compatibility with manual CLI commands.
|
||||
"""
|
||||
import sys
|
||||
|
||||
snapshot = self.archiveresult.snapshot
|
||||
@@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
def enter_failed(self):
|
||||
import sys
|
||||
|
||||
print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
@@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
def enter_skipped(self):
|
||||
import sys
|
||||
|
||||
# Set output_str if not already set (e.g., when skipped due to max attempts)
|
||||
if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(
|
||||
crawl=self.archiveresult.snapshot.crawl,
|
||||
snapshot=self.archiveresult.snapshot,
|
||||
)
|
||||
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
|
||||
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
|
||||
@@ -281,25 +281,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
"""Editor for crawl URLs."""
|
||||
widget_id = f'crawl_urls_{obj.pk}'
|
||||
|
||||
# Check if it's a local file we can edit
|
||||
source_file = obj.get_file_path()
|
||||
is_file = source_file is not None
|
||||
file_contents = ""
|
||||
error = None
|
||||
|
||||
if is_file and source_file:
|
||||
try:
|
||||
file_contents = source_file.read_text().strip()
|
||||
except Exception as e:
|
||||
error = f'Error reading {source_file}: {e}'
|
||||
|
||||
# Escape for safe HTML embedding
|
||||
escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||
escaped_file_contents = file_contents.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||
|
||||
# Count lines for auto-expand logic
|
||||
line_count = len((obj.urls or '').split('\n'))
|
||||
file_line_count = len(file_contents.split('\n')) if file_contents else 0
|
||||
uri_rows = min(max(3, line_count), 10)
|
||||
|
||||
html = f'''
|
||||
@@ -318,21 +304,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
{line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{"" if not is_file else f'''
|
||||
<!-- File contents preview (if first URL is a file://) -->
|
||||
<div style="margin-bottom: 8px;">
|
||||
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
||||
File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
||||
</label>
|
||||
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
||||
<textarea id="{widget_id}_file_preview"
|
||||
style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
|
||||
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
|
||||
readonly>{escaped_file_contents}</textarea>
|
||||
</div>
|
||||
'''}
|
||||
|
||||
</div>
|
||||
'''
|
||||
return mark_safe(html)
|
||||
|
||||
@@ -114,22 +114,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
},
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
|
||||
tags_str: str = '', config=None, created_by=None):
|
||||
"""Create a crawl from a file containing URLs."""
|
||||
urls_content = source_file.read_text()
|
||||
crawl = cls.objects.create(
|
||||
urls=urls_content,
|
||||
extractor=extractor,
|
||||
max_depth=max_depth,
|
||||
tags_str=tags_str,
|
||||
label=label or source_file.name,
|
||||
config=config or {},
|
||||
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
|
||||
)
|
||||
return crawl
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||
@@ -196,15 +180,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return crawl
|
||||
|
||||
@property
|
||||
def output_dir_parent(self) -> str:
|
||||
"""Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
|
||||
date_str = self.created_at.strftime('%Y%m%d')
|
||||
return f'users/{self.created_by.username}/crawls/{date_str}'
|
||||
def OUTPUT_DIR(self) -> Path:
|
||||
"""
|
||||
Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
|
||||
Domain is extracted from the first URL in the crawl.
|
||||
"""
|
||||
from archivebox import DATA_DIR
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@property
|
||||
def output_dir_name(self) -> str:
|
||||
"""Use crawl ID as directory name"""
|
||||
return str(self.id)
|
||||
date_str = self.created_at.strftime('%Y%m%d')
|
||||
urls = self.get_urls_list()
|
||||
domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown'
|
||||
|
||||
return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id)
|
||||
|
||||
def get_urls_list(self) -> list[str]:
|
||||
"""Get list of URLs from urls field, filtering out comments and empty lines."""
|
||||
@@ -216,52 +204,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if url.strip() and not url.strip().startswith('#')
|
||||
]
|
||||
|
||||
def get_file_path(self) -> Path | None:
|
||||
"""
|
||||
Get filesystem path if this crawl references a local file.
|
||||
Checks if the first URL is a file:// URI.
|
||||
"""
|
||||
urls = self.get_urls_list()
|
||||
if not urls:
|
||||
return None
|
||||
|
||||
first_url = urls[0]
|
||||
if not first_url.startswith('file://'):
|
||||
return None
|
||||
|
||||
# Remove file:// prefix
|
||||
path_str = first_url.replace('file://', '', 1)
|
||||
return Path(path_str)
|
||||
|
||||
def create_root_snapshot(self) -> 'Snapshot':
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
|
||||
if not first_url:
|
||||
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
|
||||
|
||||
# Try to get existing snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(crawl=self, url=first_url)
|
||||
# If exists and already queued/started, return it as-is
|
||||
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
|
||||
# Update retry_at to now so it can be picked up immediately
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['retry_at'])
|
||||
return snapshot
|
||||
except Snapshot.DoesNotExist:
|
||||
pass
|
||||
|
||||
# Create new snapshot
|
||||
root_snapshot = Snapshot.objects.create(
|
||||
crawl=self,
|
||||
url=first_url,
|
||||
status=Snapshot.INITIAL_STATE,
|
||||
retry_at=timezone.now(),
|
||||
timestamp=str(timezone.now().timestamp()),
|
||||
depth=0,
|
||||
)
|
||||
return root_snapshot
|
||||
|
||||
def add_url(self, entry: dict) -> bool:
|
||||
"""
|
||||
@@ -316,11 +258,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
Returns:
|
||||
List of newly created Snapshot objects
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
created_snapshots = []
|
||||
|
||||
print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
|
||||
print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
|
||||
|
||||
for line in self.urls.splitlines():
|
||||
if not line.strip():
|
||||
continue
|
||||
@@ -329,13 +275,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
url = entry.get('url', '')
|
||||
depth = entry.get('depth', 1)
|
||||
depth = entry.get('depth', 0)
|
||||
title = entry.get('title')
|
||||
timestamp = entry.get('timestamp')
|
||||
tags = entry.get('tags', '')
|
||||
except json.JSONDecodeError:
|
||||
url = line.strip()
|
||||
depth = 1
|
||||
depth = 0
|
||||
title = None
|
||||
timestamp = None
|
||||
tags = ''
|
||||
@@ -379,41 +325,90 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
|
||||
"""
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
|
||||
debug_log = Path('/tmp/archivebox_crawl_debug.log')
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n')
|
||||
f.flush()
|
||||
|
||||
# Get merged config with crawl context
|
||||
config = get_config(crawl=self)
|
||||
|
||||
# Load all binaries.jsonl files from plugins
|
||||
# This replaces individual on_Crawl install hooks with declarative configuration
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine_id = str(Machine.current().id)
|
||||
binaries_records = []
|
||||
|
||||
for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'):
|
||||
try:
|
||||
with open(binaries_file, 'r') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
record['machine_id'] = machine_id
|
||||
binaries_records.append(record)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Process binary declarations before running hooks
|
||||
if binaries_records:
|
||||
overrides = {'crawl': self}
|
||||
process_hook_records(binaries_records, overrides=overrides)
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Discovering Crawl hooks...\n')
|
||||
f.flush()
|
||||
hooks = discover_hooks('Crawl', config=config)
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Found {len(hooks)} hooks\n')
|
||||
f.flush()
|
||||
|
||||
for hook in hooks:
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Running hook: {hook.name}\n')
|
||||
f.flush()
|
||||
hook_start = time.time()
|
||||
plugin_name = hook.parent.name
|
||||
output_dir = self.OUTPUT_DIR / plugin_name
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = run_hook(
|
||||
# Run hook using Process.launch() - returns Process model
|
||||
process = run_hook(
|
||||
hook,
|
||||
output_dir=output_dir,
|
||||
config=config,
|
||||
crawl_id=str(self.id),
|
||||
source_url=first_url,
|
||||
source_url=self.urls, # Pass full newline-separated URLs
|
||||
)
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Hook {hook.name} completed with status={process.status}\n')
|
||||
f.flush()
|
||||
|
||||
hook_elapsed = time.time() - hook_start
|
||||
if hook_elapsed > 0.5: # Log slow hooks
|
||||
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
|
||||
|
||||
# Background hook - returns None, continues running
|
||||
if result is None:
|
||||
# Background hook - still running
|
||||
if process.status == process.StatusChoices.RUNNING:
|
||||
continue
|
||||
|
||||
# Foreground hook - process JSONL records
|
||||
records = result.get('records', [])
|
||||
from archivebox.hooks import extract_records_from_process
|
||||
records = extract_records_from_process(process)
|
||||
if records:
|
||||
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
|
||||
for record in records[:3]: # Show first 3
|
||||
@@ -423,14 +418,33 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if stats:
|
||||
print(f'[green]✓ Created: {stats}[/green]')
|
||||
|
||||
# System crawls (archivebox://*) don't create snapshots - they just run hooks
|
||||
if first_url.startswith('archivebox://'):
|
||||
return None
|
||||
# Create snapshots from all URLs in self.urls
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Creating snapshots from URLs...\n')
|
||||
f.flush()
|
||||
created_snapshots = self.create_snapshots_from_urls()
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Created {len(created_snapshots)} snapshots\n')
|
||||
f.write(f'=== Crawl.run() complete ===\n\n')
|
||||
f.flush()
|
||||
return created_snapshots[0] if created_snapshots else None
|
||||
|
||||
# Create snapshots from URLs
|
||||
root_snapshot = self.create_root_snapshot()
|
||||
self.create_snapshots_from_urls()
|
||||
return root_snapshot
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# Check if any snapshots exist for this crawl
|
||||
snapshots = Snapshot.objects.filter(crawl=self)
|
||||
|
||||
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
|
||||
if not snapshots.exists():
|
||||
return True
|
||||
|
||||
# If snapshots exist, check if all are sealed
|
||||
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def cleanup(self):
|
||||
"""Clean up background hooks and run on_CrawlEnd hooks."""
|
||||
@@ -452,7 +466,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
config = get_config(crawl=self)
|
||||
|
||||
hooks = discover_hooks('CrawlEnd', config=config)
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
|
||||
for hook in hooks:
|
||||
plugin_name = hook.parent.name
|
||||
@@ -464,7 +477,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
output_dir=output_dir,
|
||||
config=config,
|
||||
crawl_id=str(self.id),
|
||||
source_url=first_url,
|
||||
source_url=self.urls, # Pass full newline-separated URLs
|
||||
)
|
||||
|
||||
# Log failures but don't block
|
||||
@@ -494,7 +507,6 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
│ - run_hook(script, output_dir, ...) │
|
||||
│ - Parse JSONL from hook output │
|
||||
│ - process_hook_records() → creates Snapshots │
|
||||
│ • create_root_snapshot() → root snapshot for crawl │
|
||||
│ • create_snapshots_from_urls() → from self.urls field │
|
||||
│ │
|
||||
│ 2. Snapshots process independently with their own │
|
||||
@@ -518,7 +530,8 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start')
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last Snapshot sealing)
|
||||
@@ -534,6 +547,10 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if all Snapshots for this crawl are finished."""
|
||||
return self.crawl.is_finished()
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
import sys
|
||||
@@ -543,25 +560,21 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
try:
|
||||
# Run the crawl - runs hooks, processes JSONL, creates snapshots
|
||||
root_snapshot = self.crawl.run()
|
||||
first_snapshot = self.crawl.run()
|
||||
|
||||
if root_snapshot:
|
||||
print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
|
||||
if first_snapshot:
|
||||
print(f'[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr)
|
||||
# Update status to STARTED
|
||||
# Set retry_at to None so workers don't claim us (we wait for snapshots to finish)
|
||||
# Last snapshot will manually call self.seal() when done
|
||||
# Set retry_at to near future so tick() can poll and check is_finished()
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=None,
|
||||
retry_at=timezone.now() + timedelta(seconds=2),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
else:
|
||||
# No snapshots (system crawl like archivebox://install)
|
||||
print(f'[cyan]🔄 No snapshots created, allowing immediate seal[/cyan]', file=sys.stderr)
|
||||
# Set retry_at=now so next tick() will transition to sealed
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
|
||||
# Seal immediately since there's no work to do
|
||||
self.seal()
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
|
||||
|
||||
@@ -240,13 +240,14 @@ def run_hook(
|
||||
output_dir: Path,
|
||||
config: Dict[str, Any],
|
||||
timeout: Optional[int] = None,
|
||||
parent: Optional['Process'] = None,
|
||||
**kwargs: Any
|
||||
) -> HookResult:
|
||||
) -> 'Process':
|
||||
"""
|
||||
Execute a hook script with the given arguments.
|
||||
Execute a hook script with the given arguments using Process model.
|
||||
|
||||
This is the low-level hook executor. For running extractors with proper
|
||||
metadata handling, use call_extractor() instead.
|
||||
This is the low-level hook executor that creates a Process record and
|
||||
uses Process.launch() for subprocess management.
|
||||
|
||||
Config is passed to hooks via environment variables. Caller MUST use
|
||||
get_config() to merge all sources (file, env, machine, crawl, snapshot).
|
||||
@@ -257,16 +258,20 @@ def run_hook(
|
||||
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
|
||||
timeout: Maximum execution time in seconds
|
||||
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
|
||||
parent: Optional parent Process (for tracking worker->hook hierarchy)
|
||||
**kwargs: Arguments passed to the script as --key=value
|
||||
|
||||
Returns:
|
||||
HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
|
||||
Process model instance (use process.exit_code, process.stdout, process.get_records())
|
||||
|
||||
Example:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
|
||||
process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
|
||||
if process.status == 'exited':
|
||||
records = process.get_records() # Get parsed JSONL output
|
||||
"""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
import time
|
||||
start_time = time.time()
|
||||
|
||||
@@ -276,18 +281,32 @@ def run_hook(
|
||||
plugin_config = get_plugin_special_config(plugin_name, config)
|
||||
timeout = plugin_config['timeout']
|
||||
|
||||
# Get current machine
|
||||
machine = Machine.current()
|
||||
|
||||
# Auto-detect parent process if not explicitly provided
|
||||
# This enables automatic hierarchy tracking: Worker -> Hook
|
||||
if parent is None:
|
||||
try:
|
||||
parent = Process.current()
|
||||
except Exception:
|
||||
# If Process.current() fails (e.g., not in a worker context), leave parent as None
|
||||
pass
|
||||
|
||||
if not script.exists():
|
||||
return HookResult(
|
||||
returncode=1,
|
||||
stdout='',
|
||||
# Create a failed Process record for hooks that don't exist
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
parent=parent,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
cmd=['echo', f'Hook script not found: {script}'],
|
||||
timeout=timeout,
|
||||
status=Process.StatusChoices.EXITED,
|
||||
exit_code=1,
|
||||
stderr=f'Hook script not found: {script}',
|
||||
output_json=None,
|
||||
output_files=[],
|
||||
duration_ms=0,
|
||||
hook=str(script),
|
||||
plugin=script.parent.name,
|
||||
hook_name=script.name,
|
||||
)
|
||||
return process
|
||||
|
||||
# Determine the interpreter based on file extension
|
||||
ext = script.suffix.lower()
|
||||
@@ -379,130 +398,138 @@ def run_hook(
|
||||
# Create output directory if needed
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Capture files before execution to detect new output
|
||||
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||
|
||||
# Detect if this is a background hook (long-running daemon)
|
||||
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
|
||||
# Old convention: __background in stem (for backwards compatibility)
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
|
||||
# Set up output files for ALL hooks (useful for debugging)
|
||||
stdout_file = output_dir / 'stdout.log'
|
||||
stderr_file = output_dir / 'stderr.log'
|
||||
pid_file = output_dir / 'hook.pid'
|
||||
cmd_file = output_dir / 'cmd.sh'
|
||||
|
||||
try:
|
||||
# Write command script for validation
|
||||
from archivebox.misc.process_utils import write_cmd_file
|
||||
write_cmd_file(cmd_file, cmd)
|
||||
|
||||
# Open log files for writing
|
||||
with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
cwd=str(output_dir),
|
||||
stdout=out,
|
||||
stderr=err,
|
||||
env=env,
|
||||
)
|
||||
|
||||
# Write PID with mtime set to process start time for validation
|
||||
from archivebox.misc.process_utils import write_pid_file_with_mtime
|
||||
process_start_time = time.time()
|
||||
write_pid_file_with_mtime(pid_file, process.pid, process_start_time)
|
||||
|
||||
if is_background:
|
||||
# Background hook - return None immediately, don't wait
|
||||
# Process continues running, writing to stdout.log
|
||||
# ArchiveResult will poll for completion later
|
||||
return None
|
||||
|
||||
# Normal hook - wait for completion with timeout
|
||||
try:
|
||||
returncode = process.wait(timeout=timeout)
|
||||
except subprocess.TimeoutExpired:
|
||||
process.kill()
|
||||
process.wait() # Clean up zombie
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
return HookResult(
|
||||
returncode=-1,
|
||||
stdout='',
|
||||
stderr=f'Hook timed out after {timeout} seconds',
|
||||
output_json=None,
|
||||
output_files=[],
|
||||
duration_ms=duration_ms,
|
||||
hook=str(script),
|
||||
)
|
||||
|
||||
# Read output from files
|
||||
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
||||
stderr = stderr_file.read_text() if stderr_file.exists() else ''
|
||||
|
||||
# Detect new files created by the hook
|
||||
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
|
||||
# Exclude the log files themselves from new_files
|
||||
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
||||
|
||||
# Parse JSONL output from stdout
|
||||
# Each line starting with { that has 'type' field is a record
|
||||
records = []
|
||||
plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
|
||||
hook_name = script.name # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
|
||||
|
||||
for line in stdout.splitlines():
|
||||
line = line.strip()
|
||||
if not line or not line.startswith('{'):
|
||||
continue
|
||||
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if 'type' in data:
|
||||
# Add plugin metadata to every record
|
||||
data['plugin'] = plugin_name
|
||||
data['hook_name'] = hook_name
|
||||
data['plugin_hook'] = str(script)
|
||||
records.append(data)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
# Clean up log files on success (keep on failure for debugging)
|
||||
if returncode == 0:
|
||||
stdout_file.unlink(missing_ok=True)
|
||||
stderr_file.unlink(missing_ok=True)
|
||||
pid_file.unlink(missing_ok=True)
|
||||
|
||||
return HookResult(
|
||||
returncode=returncode,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
output_json=None, # Legacy field, we now use records for JSONL
|
||||
output_files=new_files,
|
||||
duration_ms=duration_ms,
|
||||
hook=str(script),
|
||||
plugin=plugin_name,
|
||||
hook_name=hook_name,
|
||||
records=records,
|
||||
# Create Process record
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
parent=parent,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
cmd=cmd,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
# Build environment from config (Process._build_env() expects self.env dict)
|
||||
# We need to set env on the process before launching
|
||||
process.env = {}
|
||||
for key, value in config.items():
|
||||
if value is None:
|
||||
continue
|
||||
elif isinstance(value, bool):
|
||||
process.env[key] = 'true' if value else 'false'
|
||||
elif isinstance(value, (list, dict)):
|
||||
process.env[key] = json.dumps(value)
|
||||
else:
|
||||
process.env[key] = str(value)
|
||||
|
||||
# Add base paths to env
|
||||
process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
|
||||
process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
|
||||
process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
|
||||
|
||||
# Add LIB_DIR and LIB_BIN_DIR
|
||||
lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
|
||||
lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
|
||||
if lib_dir:
|
||||
process.env['LIB_DIR'] = str(lib_dir)
|
||||
if not lib_bin_dir and lib_dir:
|
||||
lib_bin_dir = Path(lib_dir) / 'bin'
|
||||
if lib_bin_dir:
|
||||
process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
|
||||
|
||||
# Set PATH from Machine.config if available
|
||||
try:
|
||||
if machine and machine.config:
|
||||
machine_path = machine.config.get('config/PATH')
|
||||
if machine_path:
|
||||
# Prepend LIB_BIN_DIR to machine PATH as well
|
||||
if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
|
||||
process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
|
||||
else:
|
||||
process.env['PATH'] = machine_path
|
||||
elif lib_bin_dir:
|
||||
# Just prepend to current PATH
|
||||
current_path = os.environ.get('PATH', '')
|
||||
if not current_path.startswith(f'{lib_bin_dir}:'):
|
||||
process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
|
||||
|
||||
# Also set NODE_MODULES_DIR if configured
|
||||
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
|
||||
if node_modules_dir:
|
||||
process.env['NODE_MODULES_DIR'] = node_modules_dir
|
||||
except Exception:
|
||||
pass # Fall back to system PATH if Machine not available
|
||||
|
||||
# Save env before launching
|
||||
process.save()
|
||||
|
||||
# Launch subprocess using Process.launch()
|
||||
process.launch(background=is_background)
|
||||
|
||||
# Return Process object (caller can use process.exit_code, process.stdout, process.get_records())
|
||||
return process
|
||||
|
||||
except Exception as e:
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
return HookResult(
|
||||
returncode=-1,
|
||||
stdout='',
|
||||
# Create a failed Process record for exceptions
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(output_dir),
|
||||
cmd=cmd,
|
||||
timeout=timeout,
|
||||
status=Process.StatusChoices.EXITED,
|
||||
exit_code=-1,
|
||||
stderr=f'Failed to run hook: {type(e).__name__}: {e}',
|
||||
output_json=None,
|
||||
output_files=[],
|
||||
duration_ms=duration_ms,
|
||||
hook=str(script),
|
||||
plugin=script.parent.name,
|
||||
hook_name=script.name,
|
||||
records=[],
|
||||
)
|
||||
return process
|
||||
|
||||
|
||||
def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract JSONL records from a Process's stdout.
|
||||
|
||||
Uses the same parse_line() logic from misc/jsonl.py.
|
||||
Adds plugin metadata to each record.
|
||||
|
||||
Args:
|
||||
process: Process model instance with stdout captured
|
||||
|
||||
Returns:
|
||||
List of parsed JSONL records with plugin metadata
|
||||
"""
|
||||
from archivebox.misc.jsonl import parse_line
|
||||
|
||||
records = []
|
||||
|
||||
# Read stdout from process
|
||||
stdout = process.stdout
|
||||
if not stdout and process.stdout_file and process.stdout_file.exists():
|
||||
stdout = process.stdout_file.read_text()
|
||||
|
||||
if not stdout:
|
||||
return records
|
||||
|
||||
# Extract plugin metadata from process.pwd and process.cmd
|
||||
plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
|
||||
hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown'
|
||||
plugin_hook = process.cmd[1] if len(process.cmd) > 1 else ''
|
||||
|
||||
# Parse each line as JSONL
|
||||
for line in stdout.splitlines():
|
||||
record = parse_line(line)
|
||||
if record and 'type' in record:
|
||||
# Add plugin metadata to record
|
||||
record.setdefault('plugin', plugin_name)
|
||||
record.setdefault('hook_name', hook_name)
|
||||
record.setdefault('plugin_hook', plugin_hook)
|
||||
records.append(record)
|
||||
|
||||
return records
|
||||
|
||||
|
||||
def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
|
||||
@@ -940,7 +967,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
else:
|
||||
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
|
||||
import sys
|
||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
|
||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
|
||||
enabled_key = f'{plugin_upper}_ENABLED'
|
||||
enabled = config.get(enabled_key)
|
||||
if enabled is None:
|
||||
|
||||
18
archivebox/machine/migrations/0009_alter_binary_status.py
Normal file
18
archivebox/machine/migrations/0009_alter_binary_status.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 6.0 on 2026-01-02 08:43
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Auto-generated migration: restricts Binary.status to the two remaining
    # lifecycle choices ('queued', 'installed').

    dependencies = [
        # Must run after the worker_type field was added to this app.
        ('machine', '0008_add_worker_type_field'),
    ]

    operations = [
        migrations.AlterField(
            model_name='binary',
            name='status',
            field=models.CharField(choices=[('queued', 'Queued'), ('installed', 'Installed')], db_index=True, default='queued', max_length=16),
        ),
    ]
|
||||
@@ -683,6 +683,7 @@ class Process(models.Model):
|
||||
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
|
||||
WORKER = 'worker', 'Worker'
|
||||
CLI = 'cli', 'CLI'
|
||||
HOOK = 'hook', 'Hook'
|
||||
BINARY = 'binary', 'Binary'
|
||||
|
||||
# Primary fields
|
||||
@@ -1415,6 +1416,10 @@ class Process(models.Model):
|
||||
"""
|
||||
Check if process has exited and update status if so.
|
||||
|
||||
Cleanup when process exits:
|
||||
- Copy stdout/stderr to DB (keep files for debugging)
|
||||
- Delete PID file
|
||||
|
||||
Returns:
|
||||
exit_code if exited, None if still running
|
||||
"""
|
||||
@@ -1422,11 +1427,25 @@ class Process(models.Model):
|
||||
return self.exit_code
|
||||
|
||||
if not self.is_running:
|
||||
# Process exited - read output and update status
|
||||
# Process exited - read output and copy to DB
|
||||
if self.stdout_file and self.stdout_file.exists():
|
||||
self.stdout = self.stdout_file.read_text()
|
||||
# TODO: Uncomment to cleanup (keeping for debugging for now)
|
||||
# self.stdout_file.unlink(missing_ok=True)
|
||||
if self.stderr_file and self.stderr_file.exists():
|
||||
self.stderr = self.stderr_file.read_text()
|
||||
# TODO: Uncomment to cleanup (keeping for debugging for now)
|
||||
# self.stderr_file.unlink(missing_ok=True)
|
||||
|
||||
# Clean up PID file (not needed for debugging)
|
||||
if self.pid_file and self.pid_file.exists():
|
||||
self.pid_file.unlink(missing_ok=True)
|
||||
|
||||
# TODO: Uncomment to cleanup cmd.sh (keeping for debugging for now)
|
||||
# if self.pwd:
|
||||
# cmd_file = Path(self.pwd) / 'cmd.sh'
|
||||
# if cmd_file.exists():
|
||||
# cmd_file.unlink(missing_ok=True)
|
||||
|
||||
# Try to get exit code from proc or default to unknown
|
||||
self.exit_code = self.exit_code if self.exit_code is not None else -1
|
||||
@@ -1686,6 +1705,46 @@ class Process(models.Model):
|
||||
"""
|
||||
return cls.get_running_count(process_type=process_type, machine=machine)
|
||||
|
||||
@classmethod
|
||||
def cleanup_orphaned_chrome(cls) -> int:
|
||||
"""
|
||||
Kill orphaned Chrome processes using chrome_utils.js killZombieChrome.
|
||||
|
||||
Scans DATA_DIR for chrome/*.pid files from stale crawls (>5 min old)
|
||||
and kills any orphaned Chrome processes.
|
||||
|
||||
Called by:
|
||||
- Orchestrator on startup (cleanup from previous crashes)
|
||||
- Orchestrator periodically (every N minutes)
|
||||
|
||||
Returns:
|
||||
Number of zombie Chrome processes killed
|
||||
"""
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from django.conf import settings
|
||||
|
||||
chrome_utils = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'chrome_utils.js'
|
||||
if not chrome_utils.exists():
|
||||
return 0
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'killZombieChrome', str(settings.DATA_DIR)],
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
text=True,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
killed = int(result.stdout.strip())
|
||||
if killed > 0:
|
||||
print(f'[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]')
|
||||
return killed
|
||||
except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e:
|
||||
print(f'[red]Failed to cleanup orphaned Chrome: {e}[/red]')
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Binary State Machine
|
||||
|
||||
@@ -530,13 +530,13 @@ def log_worker_event(
|
||||
Log a worker event with structured metadata and indentation.
|
||||
|
||||
Args:
|
||||
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
|
||||
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker)
|
||||
event: Event name (Starting, Completed, Failed, etc.)
|
||||
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
|
||||
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker)
|
||||
pid: Process ID
|
||||
worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
|
||||
url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
|
||||
plugin: Plugin name (for ArchiveResultWorker)
|
||||
worker_id: Worker ID (UUID for workers)
|
||||
url: URL being processed (for SnapshotWorker)
|
||||
plugin: Plugin name (for hook processes)
|
||||
metadata: Dict of metadata to show in curly braces
|
||||
error: Exception if event is an error
|
||||
"""
|
||||
|
||||
345
archivebox/misc/progress_layout.py
Normal file
345
archivebox/misc/progress_layout.py
Normal file
@@ -0,0 +1,345 @@
|
||||
"""
|
||||
Rich Layout-based live progress display for ArchiveBox orchestrator.
|
||||
|
||||
Shows a comprehensive dashboard with:
|
||||
- Top: Crawl queue status (full width)
|
||||
- Middle: 4-column grid of SnapshotWorker progress panels
|
||||
- Bottom: Orchestrator/Daphne logs
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Any
|
||||
from collections import deque
|
||||
|
||||
from rich import box
|
||||
from rich.align import Align
|
||||
from rich.console import Console, Group, RenderableType
|
||||
from rich.layout import Layout
|
||||
from rich.panel import Panel
|
||||
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
|
||||
from archivebox.config import VERSION
|
||||
|
||||
# Maximum number of SnapshotWorker columns to display
|
||||
MAX_WORKER_COLUMNS = 4
|
||||
|
||||
|
||||
class CrawlQueuePanel:
    """Full-width header panel summarizing crawl-queue and orchestrator state."""

    def __init__(self):
        # Mutable state; the layout manager updates these fields before each render.
        self.orchestrator_status = "Idle"
        self.crawl_queue_count = 0
        self.crawl_workers_count = 0
        self.max_crawl_workers = 8
        self.crawl_id: Optional[str] = None

    def __rich__(self) -> Panel:
        """Render a four-column status row inside a rounded blue panel."""
        grid = Table.grid(expand=True)
        for justify in ("left", "center", "center", "right"):
            grid.add_column(justify=justify, ratio=1)

        # Column 1: version + wall-clock UTC timestamp.
        version_cell = Text()
        version_cell.append("ArchiveBox ", style="bold cyan")
        version_cell.append(f"v{VERSION}", style="bold yellow")
        version_cell.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")

        # Column 2: queued crawl count (highlighted yellow while non-empty).
        queued_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
        queue_cell = Text()
        queue_cell.append("Crawls: ", style="white")
        queue_cell.append(str(self.crawl_queue_count), style=f"bold {queued_style}")
        queue_cell.append(" queued", style="grey53")

        # Column 3: active worker count vs configured maximum.
        active_style = "green" if self.crawl_workers_count > 0 else "grey53"
        workers_cell = Text()
        workers_cell.append("Workers: ", style="white")
        workers_cell.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {active_style}")
        workers_cell.append(" active", style="grey53")

        # Column 4: orchestrator status word (+ truncated crawl id when known).
        status_style = "green" if self.crawl_workers_count > 0 else "grey53"
        status_cell = Text()
        status_cell.append("Status: ", style="white")
        status_cell.append(self.orchestrator_status, style=f"bold {status_style}")
        if self.crawl_id:
            status_cell.append(f" [{self.crawl_id[:8]}]", style="grey53")

        grid.add_row(version_cell, queue_cell, workers_cell, status_cell)
        return Panel(grid, style="white on blue", box=box.ROUNDED)
|
||||
|
||||
|
||||
class SnapshotWorkerPanel:
    """Render progress for a single SnapshotWorker as a fixed-height panel."""

    def __init__(self, worker_num: int):
        self.worker_num = worker_num
        self.snapshot_id: Optional[str] = None
        self.snapshot_url: Optional[str] = None
        self.total_hooks: int = 0
        self.completed_hooks: int = 0
        self.current_plugin: Optional[str] = None
        # Lifecycle: idle -> working -> completed (reset externally back to idle).
        self.status: str = "idle"
        # Rolling window of the most recent (message, style) log tuples.
        self.recent_logs: deque = deque(maxlen=5)

    def add_log(self, message: str, style: str = "white"):
        """Add a log message to this worker's recent logs."""
        self.recent_logs.append((message, style))

    def _active_body(self) -> Group:
        """Build the stacked renderables shown while a snapshot is assigned."""
        rows = []

        # Truncated URL header.
        if self.snapshot_url:
            shown = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
            rows.append(Text(shown, style="cyan"))
            rows.append(Text())  # Spacing

        # Text-drawn progress bar (30 chars wide), colored by completion percent.
        if self.total_hooks > 0:
            pct = (self.completed_hooks / self.total_hooks) * 100
            width = 30
            done = int((pct / 100) * width)

            if pct < 30:
                color = "yellow"
            elif pct < 100:
                color = "green"
            else:
                color = "blue"

            bar_text = Text()
            bar_text.append("█" * done + "░" * (width - done), style=color)
            bar_text.append(f" {pct:.0f}%", style="white")
            rows.append(bar_text)
            rows.append(Text())  # Spacing

        # Hook counters + currently-running plugin.
        summary = Table.grid(padding=(0, 1))
        summary.add_column(style="grey53", no_wrap=True)
        summary.add_column(style="white")
        summary.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
        if self.current_plugin:
            summary.add_row("Current:", Text(self.current_plugin, style="yellow"))
        rows.append(summary)
        rows.append(Text())  # Spacing

        # Tail of recent log messages, truncated to fit the column.
        if self.recent_logs:
            rows.append(Text("Recent:", style="grey53"))
            for msg, msg_style in self.recent_logs:
                rows.append(Text(f"• {msg[:30]}", style=msg_style))

        return Group(*rows)

    def __rich__(self) -> Panel:
        """Render either a centered 'Idle' placeholder or the active progress view."""
        if self.status == "idle":
            body = Align.center(
                Text("Idle", style="grey53"),
                vertical="middle",
            )
            frame_style = "grey53"
            title_style = "grey53"
        else:
            body = self._active_body()
            frame_style = "green" if self.status == "working" else "blue"
            title_style = frame_style

        return Panel(
            body,
            title=f"[{title_style}]Worker {self.worker_num}",
            border_style=frame_style,
            box=box.ROUNDED,
            height=20,
        )
|
||||
|
||||
|
||||
class OrchestratorLogPanel:
    """Scrolling panel of timestamped orchestrator and system log events."""

    def __init__(self, max_events: int = 15):
        # Bounded buffer of (timestamp, message, style) tuples; oldest entries drop off.
        self.events: deque = deque(maxlen=max_events)
        self.max_events = max_events

    def add_event(self, message: str, style: str = "white"):
        """Record an event, stamped with the current UTC time (HH:MM:SS)."""
        stamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
        self.events.append((stamp, message, style))

    def __rich__(self) -> Panel:
        """Render the buffered events (or a placeholder) in a fixed-height panel."""
        if self.events:
            rendered = []
            for stamp, message, style in self.events:
                row = Text()
                row.append(f"[{stamp}] ", style="grey53")
                row.append(message, style=style)
                rendered.append(row)
            body = Group(*rendered)
        else:
            body = Text("No recent events", style="grey53", justify="center")

        return Panel(
            body,
            title="[bold white]Orchestrator / Daphne Logs",
            border_style="white",
            box=box.ROUNDED,
            height=12,
        )
|
||||
|
||||
|
||||
class ArchiveBoxProgressLayout:
|
||||
"""
|
||||
Main layout manager for ArchiveBox orchestrator progress display.
|
||||
|
||||
Layout structure:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Crawl Queue (full width) │
|
||||
├───────────────┬───────────────┬───────────────┬─────────────┤
|
||||
│ Snapshot │ Snapshot │ Snapshot │ Snapshot │
|
||||
│ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
|
||||
│ │ │ │ │
|
||||
│ Progress + │ Progress + │ Progress + │ Progress + │
|
||||
│ Stats + │ Stats + │ Stats + │ Stats + │
|
||||
│ Logs │ Logs │ Logs │ Logs │
|
||||
├───────────────┴───────────────┴───────────────┴─────────────┤
|
||||
│ Orchestrator / Daphne Logs │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
def __init__(self, crawl_id: Optional[str] = None):
|
||||
self.crawl_id = crawl_id
|
||||
self.start_time = datetime.now(timezone.utc)
|
||||
|
||||
# Create components
|
||||
self.crawl_queue = CrawlQueuePanel()
|
||||
self.crawl_queue.crawl_id = crawl_id
|
||||
|
||||
# Create 4 worker panels
|
||||
self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
|
||||
|
||||
self.orchestrator_log = OrchestratorLogPanel(max_events=12)
|
||||
|
||||
# Create layout
|
||||
self.layout = self._make_layout()
|
||||
|
||||
# Track snapshot ID to worker panel mapping
|
||||
self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
|
||||
|
||||
def _make_layout(self) -> Layout:
|
||||
"""Define the layout structure."""
|
||||
layout = Layout(name="root")
|
||||
|
||||
# Top-level split: crawl_queue, workers, logs
|
||||
layout.split(
|
||||
Layout(name="crawl_queue", size=3),
|
||||
Layout(name="workers", ratio=1),
|
||||
Layout(name="logs", size=13),
|
||||
)
|
||||
|
||||
# Split workers into 4 columns
|
||||
layout["workers"].split_row(
|
||||
Layout(name="worker1"),
|
||||
Layout(name="worker2"),
|
||||
Layout(name="worker3"),
|
||||
Layout(name="worker4"),
|
||||
)
|
||||
|
||||
# Assign components to layout sections
|
||||
layout["crawl_queue"].update(self.crawl_queue)
|
||||
layout["worker1"].update(self.worker_panels[0])
|
||||
layout["worker2"].update(self.worker_panels[1])
|
||||
layout["worker3"].update(self.worker_panels[2])
|
||||
layout["worker4"].update(self.worker_panels[3])
|
||||
layout["logs"].update(self.orchestrator_log)
|
||||
|
||||
return layout
|
||||
|
||||
def update_orchestrator_status(
|
||||
self,
|
||||
status: str,
|
||||
crawl_queue_count: int = 0,
|
||||
crawl_workers_count: int = 0,
|
||||
max_crawl_workers: int = 8,
|
||||
):
|
||||
"""Update orchestrator status in the crawl queue panel."""
|
||||
self.crawl_queue.orchestrator_status = status
|
||||
self.crawl_queue.crawl_queue_count = crawl_queue_count
|
||||
self.crawl_queue.crawl_workers_count = crawl_workers_count
|
||||
self.crawl_queue.max_crawl_workers = max_crawl_workers
|
||||
|
||||
def update_snapshot_worker(
|
||||
self,
|
||||
snapshot_id: str,
|
||||
url: str,
|
||||
total: int,
|
||||
completed: int,
|
||||
current_plugin: str = "",
|
||||
):
|
||||
"""Update or assign a snapshot to a worker panel."""
|
||||
# Find or assign worker panel for this snapshot
|
||||
if snapshot_id not in self.snapshot_to_worker:
|
||||
# Find first idle worker panel
|
||||
worker_idx = None
|
||||
for idx, panel in enumerate(self.worker_panels):
|
||||
if panel.status == "idle":
|
||||
worker_idx = idx
|
||||
break
|
||||
|
||||
# If no idle worker, use round-robin (shouldn't happen often)
|
||||
if worker_idx is None:
|
||||
worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
|
||||
|
||||
self.snapshot_to_worker[snapshot_id] = worker_idx
|
||||
|
||||
# Get assigned worker panel
|
||||
worker_idx = self.snapshot_to_worker[snapshot_id]
|
||||
panel = self.worker_panels[worker_idx]
|
||||
|
||||
# Update panel
|
||||
panel.snapshot_id = snapshot_id
|
||||
panel.snapshot_url = url
|
||||
panel.total_hooks = total
|
||||
panel.completed_hooks = completed
|
||||
panel.current_plugin = current_plugin
|
||||
panel.status = "working" if completed < total else "completed"
|
||||
|
||||
def remove_snapshot_worker(self, snapshot_id: str):
|
||||
"""Mark a snapshot worker as idle after completion."""
|
||||
if snapshot_id in self.snapshot_to_worker:
|
||||
worker_idx = self.snapshot_to_worker[snapshot_id]
|
||||
panel = self.worker_panels[worker_idx]
|
||||
|
||||
# Mark as idle
|
||||
panel.status = "idle"
|
||||
panel.snapshot_id = None
|
||||
panel.snapshot_url = None
|
||||
panel.total_hooks = 0
|
||||
panel.completed_hooks = 0
|
||||
panel.current_plugin = None
|
||||
panel.recent_logs.clear()
|
||||
|
||||
# Remove mapping
|
||||
del self.snapshot_to_worker[snapshot_id]
|
||||
|
||||
def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
|
||||
"""Add a log message to a specific worker's panel."""
|
||||
if snapshot_id in self.snapshot_to_worker:
|
||||
worker_idx = self.snapshot_to_worker[snapshot_id]
|
||||
self.worker_panels[worker_idx].add_log(message, style)
|
||||
|
||||
def log_event(self, message: str, style: str = "white"):
|
||||
"""Add an event to the orchestrator log."""
|
||||
self.orchestrator_log.add_event(message, style)
|
||||
|
||||
def get_layout(self) -> Layout:
|
||||
"""Get the Rich Layout object for rendering."""
|
||||
return self.layout
|
||||
@@ -72,10 +72,8 @@ class TestAccessibilityWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the accessibility hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
# Run accessibility hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
@@ -116,6 +114,85 @@ class TestAccessibilityWithChrome(TestCase):
|
||||
self.skipTest(f"Chrome session setup failed: {e}")
|
||||
raise
|
||||
|
||||
def test_accessibility_disabled_skips(self):
|
||||
"""Test that ACCESSIBILITY_ENABLED=False skips without error."""
|
||||
test_url = 'https://example.com'
|
||||
snapshot_id = 'test-disabled'
|
||||
|
||||
env = get_test_env()
|
||||
env['ACCESSIBILITY_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(self.temp_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should exit 0 even when disabled
|
||||
self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}")
|
||||
|
||||
# Should NOT create output file when disabled
|
||||
accessibility_output = self.temp_dir / 'accessibility.json'
|
||||
self.assertFalse(accessibility_output.exists(), "Should not create file when disabled")
|
||||
|
||||
def test_accessibility_missing_url_argument(self):
|
||||
"""Test that missing --url argument causes error."""
|
||||
snapshot_id = 'test-missing-url'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(self.temp_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
# Should fail with non-zero exit code
|
||||
self.assertNotEqual(result.returncode, 0, "Should fail when URL missing")
|
||||
|
||||
def test_accessibility_missing_snapshot_id_argument(self):
|
||||
"""Test that missing --snapshot-id argument causes error."""
|
||||
test_url = 'https://example.com'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'],
|
||||
cwd=str(self.temp_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
# Should fail with non-zero exit code
|
||||
self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing")
|
||||
|
||||
def test_accessibility_with_no_chrome_session(self):
|
||||
"""Test that hook fails gracefully when no Chrome session exists."""
|
||||
test_url = 'https://example.com'
|
||||
snapshot_id = 'test-no-chrome'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(self.temp_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
# Should fail when no Chrome session
|
||||
self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists")
|
||||
# Error should mention CDP or Chrome
|
||||
err_lower = result.stderr.lower()
|
||||
self.assertTrue(
|
||||
any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']),
|
||||
f"Should mention Chrome/CDP in error: {result.stderr}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
|
||||
@@ -1397,11 +1397,11 @@ function getMachineType() {
|
||||
*/
|
||||
function getLibDir() {
|
||||
if (process.env.LIB_DIR) {
|
||||
return process.env.LIB_DIR;
|
||||
return path.resolve(process.env.LIB_DIR);
|
||||
}
|
||||
const dataDir = getEnv('DATA_DIR', './data');
|
||||
const machineType = getMachineType();
|
||||
return path.join(dataDir, 'lib', machineType);
|
||||
return path.resolve(path.join(dataDir, 'lib', machineType));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1412,9 +1412,9 @@ function getLibDir() {
|
||||
*/
|
||||
function getNodeModulesDir() {
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
return process.env.NODE_MODULES_DIR;
|
||||
return path.resolve(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
return path.join(getLibDir(), 'npm', 'node_modules');
|
||||
return path.resolve(path.join(getLibDir(), 'npm', 'node_modules'));
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -37,9 +37,8 @@ Usage:
|
||||
|
||||
# For Chrome session tests:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_chrome_session, # Full Chrome + tab setup
|
||||
cleanup_chrome, # Cleanup by PID
|
||||
chrome_session, # Context manager
|
||||
chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup)
|
||||
cleanup_chrome, # Manual cleanup by PID (rarely needed)
|
||||
)
|
||||
|
||||
# For extension tests:
|
||||
@@ -184,8 +183,7 @@ def get_lib_dir() -> Path:
|
||||
# Fallback to Python
|
||||
if os.environ.get('LIB_DIR'):
|
||||
return Path(os.environ['LIB_DIR'])
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(str(STORAGE_CONFIG.LIB_DIR))
|
||||
raise Exception('LIB_DIR env var must be set!')
|
||||
|
||||
|
||||
def get_node_modules_dir() -> Path:
|
||||
@@ -695,111 +693,6 @@ def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_chrome_session(
|
||||
tmpdir: Path,
|
||||
crawl_id: str = 'test-crawl',
|
||||
snapshot_id: str = 'test-snapshot',
|
||||
test_url: str = 'about:blank',
|
||||
navigate: bool = True,
|
||||
timeout: int = 15,
|
||||
) -> Tuple[subprocess.Popen, int, Path]:
|
||||
"""Set up a Chrome session with tab and optional navigation.
|
||||
|
||||
Creates the directory structure, launches Chrome, creates a tab,
|
||||
and optionally navigates to the test URL.
|
||||
|
||||
Args:
|
||||
tmpdir: Temporary directory for test files
|
||||
crawl_id: ID to use for the crawl
|
||||
snapshot_id: ID to use for the snapshot
|
||||
test_url: URL to navigate to (if navigate=True)
|
||||
navigate: Whether to navigate to the URL after creating tab
|
||||
timeout: Seconds to wait for Chrome to start
|
||||
|
||||
Returns:
|
||||
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to start or tab creation fails
|
||||
"""
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir(exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(timeout):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
||||
raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
# Create snapshot directory structure
|
||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
||||
snapshot_dir.mkdir(exist_ok=True)
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Create tab
|
||||
tab_env = env.copy()
|
||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=tab_env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Tab creation timed out after 60s")
|
||||
|
||||
# Navigate to URL if requested
|
||||
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Navigation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Navigation timed out after 120s")
|
||||
|
||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
|
||||
"""Clean up Chrome processes using chrome_utils.js killChrome.
|
||||
|
||||
@@ -835,8 +728,12 @@ def chrome_session(
|
||||
):
|
||||
"""Context manager for Chrome sessions with automatic cleanup.
|
||||
|
||||
Creates the directory structure, launches Chrome, creates a tab,
|
||||
and optionally navigates to the test URL. Automatically cleans up
|
||||
Chrome on exit.
|
||||
|
||||
Usage:
|
||||
with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
|
||||
with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env):
|
||||
# Run tests with chrome session
|
||||
pass
|
||||
# Chrome automatically cleaned up
|
||||
@@ -850,20 +747,129 @@ def chrome_session(
|
||||
timeout: Seconds to wait for Chrome to start
|
||||
|
||||
Yields:
|
||||
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
|
||||
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to start or tab creation fails
|
||||
"""
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id=crawl_id,
|
||||
snapshot_id=snapshot_id,
|
||||
test_url=test_url,
|
||||
navigate=navigate,
|
||||
timeout=timeout,
|
||||
# Create proper directory structure in tmpdir
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
data_dir = Path(tmpdir) / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Create lib structure for puppeteer installation
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create crawl and snapshot directories
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir(exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Build env with tmpdir-specific paths
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'NODE_PATH': str(node_modules_dir),
|
||||
'NPM_BIN_DIR': str(npm_dir / '.bin'),
|
||||
'CHROME_HEADLESS': 'true',
|
||||
})
|
||||
|
||||
# CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
|
||||
# chrome_launch assumes chrome_install has already run
|
||||
install_result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
if install_result.returncode != 0:
|
||||
raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(timeout):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
||||
raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
# Create snapshot directory structure
|
||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
||||
snapshot_dir.mkdir(exist_ok=True)
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Create tab
|
||||
tab_env = env.copy()
|
||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=tab_env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Tab creation timed out after 60s")
|
||||
|
||||
# Navigate to URL if requested
|
||||
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError(f"Navigation failed: {result.stderr}")
|
||||
except subprocess.TimeoutExpired:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
raise RuntimeError("Navigation timed out after 120s")
|
||||
|
||||
yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env
|
||||
finally:
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
@@ -525,10 +525,9 @@ def test_zombie_prevention_hook_killed():
|
||||
time.sleep(1)
|
||||
|
||||
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
|
||||
assert (chrome_dir / 'hook.pid').exists(), "Hook PID file should exist"
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
hook_pid = int((chrome_dir / 'hook.pid').read_text().strip())
|
||||
hook_pid = chrome_launch_process.pid # Use the Popen process PID instead of hook.pid file
|
||||
|
||||
# Verify both Chrome and hook are running
|
||||
try:
|
||||
|
||||
260
archivebox/plugins/chrome/tests/test_chrome_test_helpers.py
Normal file
260
archivebox/plugins/chrome/tests/test_chrome_test_helpers.py
Normal file
@@ -0,0 +1,260 @@
|
||||
"""
|
||||
Tests for chrome_test_helpers.py functions.
|
||||
|
||||
These tests verify the Python helper functions used across Chrome plugin tests.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_machine_type,
|
||||
get_lib_dir,
|
||||
get_node_modules_dir,
|
||||
get_extensions_dir,
|
||||
find_chromium_binary,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
|
||||
def test_get_machine_type():
|
||||
"""Test get_machine_type() returns valid format."""
|
||||
machine_type = get_machine_type()
|
||||
assert isinstance(machine_type, str)
|
||||
assert '-' in machine_type, "Machine type should be in format: arch-os"
|
||||
# Should be one of the expected formats
|
||||
assert any(x in machine_type for x in ['arm64', 'x86_64']), "Should contain valid architecture"
|
||||
assert any(x in machine_type for x in ['darwin', 'linux', 'win32']), "Should contain valid OS"
|
||||
|
||||
|
||||
def test_get_lib_dir_with_env_var():
|
||||
"""Test get_lib_dir() respects LIB_DIR env var."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
custom_lib = Path(tmpdir) / 'custom_lib'
|
||||
custom_lib.mkdir()
|
||||
|
||||
old_lib_dir = os.environ.get('LIB_DIR')
|
||||
try:
|
||||
os.environ['LIB_DIR'] = str(custom_lib)
|
||||
lib_dir = get_lib_dir()
|
||||
assert lib_dir == custom_lib
|
||||
finally:
|
||||
if old_lib_dir:
|
||||
os.environ['LIB_DIR'] = old_lib_dir
|
||||
else:
|
||||
os.environ.pop('LIB_DIR', None)
|
||||
|
||||
|
||||
def test_get_node_modules_dir_with_env_var():
|
||||
"""Test get_node_modules_dir() respects NODE_MODULES_DIR env var."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
custom_nm = Path(tmpdir) / 'node_modules'
|
||||
custom_nm.mkdir()
|
||||
|
||||
old_nm_dir = os.environ.get('NODE_MODULES_DIR')
|
||||
try:
|
||||
os.environ['NODE_MODULES_DIR'] = str(custom_nm)
|
||||
nm_dir = get_node_modules_dir()
|
||||
assert nm_dir == custom_nm
|
||||
finally:
|
||||
if old_nm_dir:
|
||||
os.environ['NODE_MODULES_DIR'] = old_nm_dir
|
||||
else:
|
||||
os.environ.pop('NODE_MODULES_DIR', None)
|
||||
|
||||
|
||||
def test_get_extensions_dir_default():
|
||||
"""Test get_extensions_dir() returns expected path format."""
|
||||
ext_dir = get_extensions_dir()
|
||||
assert isinstance(ext_dir, str)
|
||||
assert 'personas' in ext_dir
|
||||
assert 'chrome_extensions' in ext_dir
|
||||
|
||||
|
||||
def test_get_extensions_dir_with_custom_persona():
|
||||
"""Test get_extensions_dir() respects ACTIVE_PERSONA env var."""
|
||||
old_persona = os.environ.get('ACTIVE_PERSONA')
|
||||
old_data_dir = os.environ.get('DATA_DIR')
|
||||
try:
|
||||
os.environ['ACTIVE_PERSONA'] = 'TestPersona'
|
||||
os.environ['DATA_DIR'] = '/tmp/test'
|
||||
ext_dir = get_extensions_dir()
|
||||
assert 'TestPersona' in ext_dir
|
||||
assert '/tmp/test' in ext_dir
|
||||
finally:
|
||||
if old_persona:
|
||||
os.environ['ACTIVE_PERSONA'] = old_persona
|
||||
else:
|
||||
os.environ.pop('ACTIVE_PERSONA', None)
|
||||
if old_data_dir:
|
||||
os.environ['DATA_DIR'] = old_data_dir
|
||||
else:
|
||||
os.environ.pop('DATA_DIR', None)
|
||||
|
||||
|
||||
def test_get_test_env_returns_dict():
|
||||
"""Test get_test_env() returns properly formatted environment dict."""
|
||||
env = get_test_env()
|
||||
assert isinstance(env, dict)
|
||||
|
||||
# Should include key paths
|
||||
assert 'MACHINE_TYPE' in env
|
||||
assert 'LIB_DIR' in env
|
||||
assert 'NODE_MODULES_DIR' in env
|
||||
assert 'NODE_PATH' in env # Critical for module resolution
|
||||
assert 'NPM_BIN_DIR' in env
|
||||
assert 'CHROME_EXTENSIONS_DIR' in env
|
||||
|
||||
# Verify NODE_PATH equals NODE_MODULES_DIR (for Node.js module resolution)
|
||||
assert env['NODE_PATH'] == env['NODE_MODULES_DIR']
|
||||
|
||||
|
||||
def test_get_test_env_paths_are_absolute():
|
||||
"""Test that get_test_env() returns absolute paths."""
|
||||
env = get_test_env()
|
||||
|
||||
# All path-like values should be absolute
|
||||
assert Path(env['LIB_DIR']).is_absolute()
|
||||
assert Path(env['NODE_MODULES_DIR']).is_absolute()
|
||||
assert Path(env['NODE_PATH']).is_absolute()
|
||||
|
||||
|
||||
def test_find_chromium_binary():
|
||||
"""Test find_chromium_binary() returns a path or None."""
|
||||
binary = find_chromium_binary()
|
||||
if binary:
|
||||
assert isinstance(binary, str)
|
||||
# Should be an absolute path if found
|
||||
assert os.path.isabs(binary)
|
||||
|
||||
|
||||
def test_get_plugin_dir():
|
||||
"""Test get_plugin_dir() finds correct plugin directory."""
|
||||
# Use this test file's path
|
||||
test_file = __file__
|
||||
plugin_dir = get_plugin_dir(test_file)
|
||||
|
||||
assert plugin_dir.exists()
|
||||
assert plugin_dir.is_dir()
|
||||
# Should be the chrome plugin directory
|
||||
assert plugin_dir.name == 'chrome'
|
||||
assert (plugin_dir.parent.name == 'plugins')
|
||||
|
||||
|
||||
def test_get_hook_script_finds_existing_hook():
|
||||
"""Test get_hook_script() can find an existing hook."""
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
|
||||
|
||||
# Try to find the chrome launch hook
|
||||
hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
|
||||
|
||||
if hook: # May not exist in all test environments
|
||||
assert hook.exists()
|
||||
assert hook.is_file()
|
||||
assert 'chrome_launch' in hook.name
|
||||
|
||||
|
||||
def test_get_hook_script_returns_none_for_missing():
|
||||
"""Test get_hook_script() returns None for non-existent hooks."""
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
|
||||
|
||||
hook = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*')
|
||||
assert hook is None
|
||||
|
||||
|
||||
def test_parse_jsonl_output_valid():
|
||||
"""Test parse_jsonl_output() parses valid JSONL."""
|
||||
jsonl_output = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"}
|
||||
{"type": "ArchiveResult", "status": "failed", "error": "test2"}
|
||||
'''
|
||||
|
||||
# Returns first match only
|
||||
result = parse_jsonl_output(jsonl_output)
|
||||
assert result is not None
|
||||
assert result['type'] == 'ArchiveResult'
|
||||
assert result['status'] == 'succeeded'
|
||||
assert result['output'] == 'test1'
|
||||
|
||||
|
||||
def test_parse_jsonl_output_with_non_json_lines():
|
||||
"""Test parse_jsonl_output() skips non-JSON lines."""
|
||||
mixed_output = '''Some non-JSON output
|
||||
{"type": "ArchiveResult", "status": "succeeded"}
|
||||
More non-JSON
|
||||
{"type": "ArchiveResult", "status": "failed"}
|
||||
'''
|
||||
|
||||
result = parse_jsonl_output(mixed_output)
|
||||
assert result is not None
|
||||
assert result['type'] == 'ArchiveResult'
|
||||
assert result['status'] == 'succeeded'
|
||||
|
||||
|
||||
def test_parse_jsonl_output_empty():
|
||||
"""Test parse_jsonl_output() handles empty input."""
|
||||
result = parse_jsonl_output('')
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_parse_jsonl_output_filters_by_type():
|
||||
"""Test parse_jsonl_output() can filter by record type."""
|
||||
jsonl_output = '''{"type": "LogEntry", "data": "log1"}
|
||||
{"type": "ArchiveResult", "data": "result1"}
|
||||
{"type": "ArchiveResult", "data": "result2"}
|
||||
'''
|
||||
|
||||
# Should return first ArchiveResult, not LogEntry
|
||||
result = parse_jsonl_output(jsonl_output, record_type='ArchiveResult')
|
||||
assert result is not None
|
||||
assert result['type'] == 'ArchiveResult'
|
||||
assert result['data'] == 'result1' # First ArchiveResult
|
||||
|
||||
|
||||
def test_parse_jsonl_output_filters_custom_type():
|
||||
"""Test parse_jsonl_output() can filter by custom record type."""
|
||||
jsonl_output = '''{"type": "ArchiveResult", "data": "result1"}
|
||||
{"type": "LogEntry", "data": "log1"}
|
||||
{"type": "ArchiveResult", "data": "result2"}
|
||||
'''
|
||||
|
||||
result = parse_jsonl_output(jsonl_output, record_type='LogEntry')
|
||||
assert result is not None
|
||||
assert result['type'] == 'LogEntry'
|
||||
assert result['data'] == 'log1'
|
||||
|
||||
|
||||
def test_machine_type_consistency():
|
||||
"""Test that machine type is consistent across calls."""
|
||||
mt1 = get_machine_type()
|
||||
mt2 = get_machine_type()
|
||||
assert mt1 == mt2, "Machine type should be stable across calls"
|
||||
|
||||
|
||||
def test_lib_dir_is_directory():
|
||||
"""Test that lib_dir points to an actual directory when DATA_DIR is set."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
old_data_dir = os.environ.get('DATA_DIR')
|
||||
try:
|
||||
os.environ['DATA_DIR'] = tmpdir
|
||||
# Create the expected directory structure
|
||||
machine_type = get_machine_type()
|
||||
lib_dir = Path(tmpdir) / 'lib' / machine_type
|
||||
lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = get_lib_dir()
|
||||
# Should return a Path object
|
||||
assert isinstance(result, Path)
|
||||
finally:
|
||||
if old_data_dir:
|
||||
os.environ['DATA_DIR'] = old_data_dir
|
||||
else:
|
||||
os.environ.pop('DATA_DIR', None)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -72,10 +72,9 @@ class TestConsolelogWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the consolelog hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run consolelog hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
80
archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py
Executable file
80
archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect forum-dl binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if forum-dl is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
|
||||
forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl')
|
||||
|
||||
if not forumdl_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=forumdl_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='forum-dl')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='forum-dl', binproviders='pip')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='forum-dl', binproviders='pip')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -24,8 +24,7 @@ import pytest
|
||||
# Import shared Chrome test helpers
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
chrome_session,
|
||||
)
|
||||
|
||||
|
||||
@@ -101,22 +100,17 @@ def test_fails_gracefully_without_chrome_session():
|
||||
def test_scrolls_page_and_outputs_stats():
|
||||
"""Integration test: scroll page and verify JSONL output format."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-infiniscroll',
|
||||
snapshot_id='snap-infiniscroll',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-infiniscroll',
|
||||
snapshot_id='snap-infiniscroll',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Create infiniscroll output directory (sibling to chrome)
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Run infiniscroll hook
|
||||
env = get_test_env()
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling
|
||||
env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test
|
||||
@@ -158,29 +152,21 @@ def test_scrolls_page_and_outputs_stats():
|
||||
output_files = list(infiniscroll_dir.iterdir())
|
||||
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
|
||||
|
||||
finally:
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_config_scroll_limit_honored():
|
||||
"""Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-scroll-limit',
|
||||
snapshot_id='snap-limit',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-scroll-limit',
|
||||
snapshot_id='snap-limit',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Set scroll limit to 2
|
||||
env = get_test_env()
|
||||
# Set scroll limit to 2 (use env from setup_chrome_session)
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '2'
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '500'
|
||||
env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in
|
||||
@@ -215,29 +201,22 @@ def test_config_scroll_limit_honored():
|
||||
assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
|
||||
|
||||
finally:
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
"""Test that INFINISCROLL_TIMEOUT config is respected."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-timeout',
|
||||
snapshot_id='snap-timeout',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-timeout',
|
||||
snapshot_id='snap-timeout',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Set very short timeout
|
||||
env = get_test_env()
|
||||
# Set very short timeout (use env from setup_chrome_session)
|
||||
env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit
|
||||
@@ -258,9 +237,6 @@ def test_config_timeout_honored():
|
||||
assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
|
||||
assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
|
||||
|
||||
finally:
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -154,8 +154,7 @@ def test_extension_loads_in_chromium():
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir,
|
||||
env=get_test_env()),
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect mercury-parser binary and emit Binary JSONL record.
|
||||
Detect postlight-parser binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if mercury-parser is found
|
||||
Output: Binary JSONL record to stdout if postlight-parser is found
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -48,6 +48,11 @@ def output_binary_missing(name: str, binproviders: str):
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'overrides': {
|
||||
'npm': {
|
||||
'packages': ['@postlight/parser'],
|
||||
}
|
||||
},
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
@@ -55,7 +60,7 @@ def output_binary_missing(name: str, binproviders: str):
|
||||
|
||||
def main():
|
||||
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
|
||||
mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
|
||||
mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
|
||||
|
||||
if not mercury_enabled:
|
||||
sys.exit(0)
|
||||
@@ -65,13 +70,13 @@ def main():
|
||||
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='mercury-parser')
|
||||
output_binary_found(binary, name='postlight-parser')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='mercury-parser', binproviders='npm')
|
||||
output_binary_missing(name='postlight-parser', binproviders='npm')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='mercury-parser', binproviders='npm')
|
||||
output_binary_missing(name='postlight-parser', binproviders='npm')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
@@ -25,8 +25,7 @@ import pytest
|
||||
# Import shared Chrome test helpers
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
chrome_session,
|
||||
)
|
||||
|
||||
|
||||
@@ -103,129 +102,119 @@ def test_fails_gracefully_without_chrome_session():
|
||||
def test_background_script_handles_sigterm():
|
||||
"""Test that background script runs and handles SIGTERM correctly."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-modalcloser',
|
||||
snapshot_id='snap-modalcloser',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Create modalcloser output directory (sibling to chrome)
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
|
||||
# Create modalcloser output directory (sibling to chrome)
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
# Run modalcloser as background process (use env from setup_chrome_session)
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
|
||||
|
||||
# Run modalcloser as background process
|
||||
env = get_test_env()
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
# Let it run for a bit
|
||||
time.sleep(2)
|
||||
|
||||
# Let it run for a bit
|
||||
time.sleep(2)
|
||||
# Verify it's still running (background script)
|
||||
assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
|
||||
|
||||
# Verify it's still running (background script)
|
||||
assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
|
||||
# Send SIGTERM
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
|
||||
# Send SIGTERM
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
|
||||
|
||||
assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
# Verify output_str format
|
||||
output_str = result_json.get('output_str', '')
|
||||
assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
|
||||
f"output_str should mention modals/dialogs: {output_str}"
|
||||
|
||||
# Verify output_str format
|
||||
output_str = result_json.get('output_str', '')
|
||||
assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
|
||||
f"output_str should mention modals/dialogs: {output_str}"
|
||||
|
||||
# Verify no files created in output directory
|
||||
output_files = list(modalcloser_dir.iterdir())
|
||||
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
|
||||
# Verify no files created in output directory
|
||||
output_files = list(modalcloser_dir.iterdir())
|
||||
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
|
||||
|
||||
finally:
|
||||
if modalcloser_process and modalcloser_process.poll() is None:
|
||||
modalcloser_process.kill()
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_dialog_handler_logs_dialogs():
|
||||
"""Test that dialog handler is set up correctly."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
chrome_launch_process = None
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-dialog',
|
||||
snapshot_id='snap-dialog',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-dialog',
|
||||
snapshot_id='snap-dialog',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '200'
|
||||
# Use env from setup_chrome_session
|
||||
env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '200'
|
||||
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Let it run briefly
|
||||
time.sleep(1.5)
|
||||
# Let it run briefly
|
||||
time.sleep(1.5)
|
||||
|
||||
# Verify it's running
|
||||
assert modalcloser_process.poll() is None, "Should be running"
|
||||
# Verify it's running
|
||||
assert modalcloser_process.poll() is None, "Should be running"
|
||||
|
||||
# Check stderr for "listening" message
|
||||
# Note: Can't read stderr while process is running without blocking,
|
||||
# so we just verify it exits cleanly
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
# Check stderr for "listening" message
|
||||
# Note: Can't read stderr while process is running without blocking,
|
||||
# so we just verify it exits cleanly
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
|
||||
assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
|
||||
f"Should log startup message: {stderr}"
|
||||
assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
|
||||
assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
|
||||
f"Should log startup message: {stderr}"
|
||||
assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
|
||||
|
||||
finally:
|
||||
if modalcloser_process and modalcloser_process.poll() is None:
|
||||
modalcloser_process.kill()
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_config_poll_interval():
|
||||
@@ -235,61 +224,58 @@ def test_config_poll_interval():
|
||||
chrome_pid = None
|
||||
modalcloser_process = None
|
||||
try:
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-poll',
|
||||
snapshot_id='snap-poll',
|
||||
test_url=TEST_URL,
|
||||
)
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-poll',
|
||||
snapshot_id='snap-poll',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||
modalcloser_dir.mkdir()
|
||||
|
||||
# Set very short poll interval
|
||||
env = get_test_env()
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
|
||||
# Set very short poll interval (use env from setup_chrome_session)
|
||||
env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
|
||||
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
modalcloser_process = subprocess.Popen(
|
||||
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
|
||||
cwd=str(modalcloser_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Run for short time
|
||||
time.sleep(1)
|
||||
# Run for short time
|
||||
time.sleep(1)
|
||||
|
||||
# Should still be running
|
||||
assert modalcloser_process.poll() is None, "Should still be running"
|
||||
# Should still be running
|
||||
assert modalcloser_process.poll() is None, "Should still be running"
|
||||
|
||||
# Clean exit
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
# Clean exit
|
||||
modalcloser_process.send_signal(signal.SIGTERM)
|
||||
stdout, stderr = modalcloser_process.communicate(timeout=5)
|
||||
|
||||
assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
|
||||
assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
|
||||
|
||||
# Verify JSONL output exists
|
||||
result_json = None
|
||||
for line in stdout.strip().split('\n'):
|
||||
if line.strip().startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Verify JSONL output exists
|
||||
result_json = None
|
||||
for line in stdout.strip().split('\n'):
|
||||
if line.strip().startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json is not None, "Should have JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert result_json is not None, "Should have JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
finally:
|
||||
if modalcloser_process and modalcloser_process.poll() is None:
|
||||
modalcloser_process.kill()
|
||||
if chrome_launch_process and chrome_pid:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
|
||||
80
archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py
Executable file
80
archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect papers-dl binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if papers-dl is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the value of environment variable *name* with surrounding
    whitespace stripped; fall back to *default* (also stripped) when unset."""
    raw = os.environ.get(name, default)
    return raw.strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse environment variable *name* as a boolean flag.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
    ignored); any other value -- including an unset variable -- yields
    *default*.
    """
    # Inline of get_env(name, '').lower(): read, strip, and normalize case.
    text = os.environ.get(name, '').strip().lower()
    if text in ('true', '1', 'yes', 'on'):
        return True
    return False if text in ('false', '0', 'no', 'off') else default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
    """Emit a Binary JSONL record to stdout for an already-installed binary.

    Field values come from the loaded abx_pkg Binary; empty strings stand in
    for metadata (version/sha256) the loader could not determine.
    """
    payload = dict(
        type='Binary',
        name=name,
        abspath=str(binary.abspath),
        version=str(binary.version) if binary.version else '',
        sha256=binary.sha256 or '',
        binprovider='env',  # found on the machine already, so the env provider owns it
        machine_id=os.environ.get('MACHINE_ID', ''),
    )
    print(json.dumps(payload))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
    """Emit a Binary JSONL record to stdout for a binary that must be installed.

    *binproviders* names the provider(s) (e.g. 'pip') able to install it on
    this machine.
    """
    payload = dict(
        type='Binary',
        name=name,
        binproviders=binproviders,  # who is able to install it
        machine_id=os.environ.get('MACHINE_ID', ''),
    )
    print(json.dumps(payload))
|
||||
|
||||
|
||||
def main():
    """Detect the papers-dl binary and report its status as a Binary JSONL record.

    Exits 0 without output when PAPERSDL_ENABLED is false; otherwise emits a
    "found" record (env provider) or a "missing" record installable via pip.
    """
    enabled = get_env_bool('PAPERSDL_ENABLED', True)
    target = get_env('PAPERSDL_BINARY', 'papers-dl')

    if not enabled:
        sys.exit(0)

    provider = EnvProvider()
    try:
        loaded = Binary(name=target, binproviders=[provider]).load()
        if loaded.abspath:
            # Resolved to a real path -- already installed
            output_binary_found(loaded, name='papers-dl')
        else:
            # No path resolved -- needs installation
            output_binary_missing(name='papers-dl', binproviders='pip')
    except Exception:
        # Any detection failure is treated the same as "not installed"
        output_binary_missing(name='papers-dl', binproviders='pip')

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -72,10 +72,9 @@ class TestParseDomOutlinksWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the outlinks hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run outlinks hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -73,10 +73,9 @@ class TestRedirectsWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the redirects hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run redirects hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -72,10 +72,9 @@ class TestResponsesWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the responses hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run responses hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -1,20 +1,15 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Take a screenshot of a URL using Chrome/Puppeteer.
|
||||
* Take a screenshot of a URL using an existing Chrome session.
|
||||
*
|
||||
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
* Requires chrome plugin to have already created a Chrome session.
|
||||
* Connects to the existing session via CDP and takes a screenshot.
|
||||
*
|
||||
* Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes screenshot/screenshot.png
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
|
||||
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
|
||||
*/
|
||||
|
||||
@@ -24,10 +19,8 @@ const path = require('path');
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const {
|
||||
findChromium,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseResolution,
|
||||
parseArgs,
|
||||
readCdpUrl,
|
||||
@@ -56,7 +49,7 @@ function hasStaticFileOutput() {
|
||||
}
|
||||
|
||||
// Wait for chrome tab to be fully loaded
|
||||
async function waitForChromeTabLoaded(timeoutMs = 60000) {
|
||||
async function waitForChromeTabLoaded(timeoutMs = 10000) {
|
||||
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
|
||||
const startTime = Date.now();
|
||||
|
||||
@@ -72,102 +65,66 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
|
||||
}
|
||||
|
||||
async function takeScreenshot(url) {
|
||||
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
|
||||
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
|
||||
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
|
||||
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
|
||||
const headless = getEnvBool('CHROME_HEADLESS', true);
|
||||
|
||||
const resolution = getEnv('CHROME_RESOLUTION', '1440,2000');
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
let page = null;
|
||||
let connectedToSession = false;
|
||||
// Wait for chrome_navigate to complete (writes navigation.json)
|
||||
const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10);
|
||||
const timeoutMs = timeoutSeconds * 1000;
|
||||
const pageLoaded = await waitForChromeTabLoaded(timeoutMs);
|
||||
if (!pageLoaded) {
|
||||
throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`);
|
||||
}
|
||||
|
||||
// Connect to existing Chrome session (required - no fallback)
|
||||
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
|
||||
if (!cdpUrl) {
|
||||
throw new Error('No Chrome session found (chrome plugin must run first)');
|
||||
}
|
||||
|
||||
// Read target_id.txt to get the specific tab for this snapshot
|
||||
const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
|
||||
if (!fs.existsSync(targetIdFile)) {
|
||||
throw new Error('No target_id.txt found (chrome_tab must run first)');
|
||||
}
|
||||
const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
|
||||
|
||||
const browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
defaultViewport: { width, height },
|
||||
});
|
||||
|
||||
try {
|
||||
// Try to connect to existing Chrome session
|
||||
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
|
||||
if (cdpUrl) {
|
||||
try {
|
||||
browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
defaultViewport: { width, height },
|
||||
});
|
||||
connectedToSession = true;
|
||||
|
||||
// Get existing pages or create new one
|
||||
const pages = await browser.pages();
|
||||
page = pages.find(p => p.url().startsWith('http')) || pages[0];
|
||||
|
||||
if (!page) {
|
||||
page = await browser.newPage();
|
||||
}
|
||||
|
||||
// Set viewport on the page
|
||||
await page.setViewport({ width, height });
|
||||
|
||||
} catch (e) {
|
||||
console.error(`Failed to connect to CDP session: ${e.message}`);
|
||||
browser = null;
|
||||
}
|
||||
// Get the specific page for this snapshot by target ID
|
||||
const targets = await browser.targets();
|
||||
const target = targets.find(t => t._targetId === targetId);
|
||||
if (!target) {
|
||||
throw new Error(`Target ${targetId} not found in Chrome session`);
|
||||
}
|
||||
|
||||
// Fall back to launching new browser
|
||||
if (!browser) {
|
||||
const executablePath = findChromium();
|
||||
if (!executablePath) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
executablePath,
|
||||
headless: headless ? 'new' : false,
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
`--window-size=${width},${height}`,
|
||||
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
||||
],
|
||||
defaultViewport: { width, height },
|
||||
});
|
||||
|
||||
page = await browser.newPage();
|
||||
|
||||
// Navigate to URL (only if we launched fresh browser)
|
||||
if (userAgent) {
|
||||
await page.setUserAgent(userAgent);
|
||||
}
|
||||
|
||||
await page.goto(url, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout,
|
||||
});
|
||||
const page = await target.page();
|
||||
if (!page) {
|
||||
throw new Error(`Could not get page for target ${targetId}`);
|
||||
}
|
||||
|
||||
// Take screenshot
|
||||
// Set viewport on the page
|
||||
await page.setViewport({ width, height });
|
||||
|
||||
// Take screenshot (Puppeteer throws on failure)
|
||||
await page.screenshot({
|
||||
path: outputPath,
|
||||
fullPage: true,
|
||||
});
|
||||
|
||||
if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
|
||||
return { success: true, output: outputPath };
|
||||
} else {
|
||||
return { success: false, error: 'Screenshot file not created' };
|
||||
}
|
||||
return outputPath;
|
||||
|
||||
} catch (e) {
|
||||
return { success: false, error: `${e.name}: ${e.message}` };
|
||||
} finally {
|
||||
// Only close browser if we launched it (not if we connected to session)
|
||||
if (browser && !connectedToSession) {
|
||||
await browser.close();
|
||||
}
|
||||
// Disconnect from browser (don't close it - we're connected to a shared session)
|
||||
// The chrome_launch hook manages the browser lifecycle
|
||||
await browser.disconnect();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,54 +138,33 @@ async function main() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
try {
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
|
||||
// Permanent skip - emit ArchiveResult
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await takeScreenshot(url);
|
||||
|
||||
if (result.success) {
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(result.output).size;
|
||||
console.error(`Screenshot saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: result.output,
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (e) {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
|
||||
// Permanent skip - emit ArchiveResult
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Take screenshot (throws on error)
|
||||
const outputPath = await takeScreenshot(url);
|
||||
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(outputPath).size;
|
||||
console.error(`Screenshot saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: outputPath,
|
||||
}));
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
@@ -25,6 +25,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
chrome_session,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
CHROME_PLUGIN_DIR,
|
||||
@@ -62,192 +63,96 @@ def test_verify_deps_with_abx_pkg():
|
||||
assert node_loaded and node_loaded.abspath, "Node.js required for screenshot plugin"
|
||||
|
||||
|
||||
def test_extracts_screenshot_from_example_com():
|
||||
"""Test full workflow: extract screenshot from real example.com via hook.
|
||||
|
||||
Replicates production directory structure:
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/
|
||||
|
||||
This exercises the "connect to existing session" code path which is the primary
|
||||
path in production and accounts for ~50% of the code.
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
import os
|
||||
|
||||
def test_screenshot_with_chrome_session():
|
||||
"""Test multiple screenshot scenarios with one Chrome session to save time."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# Replicate exact production directory structure
|
||||
data_dir = Path(tmpdir)
|
||||
crawl_id = 'test-screenshot-crawl'
|
||||
test_url = 'https://example.com'
|
||||
snapshot_id = 'test-screenshot-snap'
|
||||
|
||||
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
|
||||
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True)
|
||||
|
||||
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir(parents=True)
|
||||
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Step 1: Launch Chrome session at crawl level (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(15):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
|
||||
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
try:
|
||||
# Step 2: Create tab at snapshot level
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=env
|
||||
)
|
||||
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
|
||||
assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-screenshot-crawl',
|
||||
snapshot_id=snapshot_id,
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
# Step 3: Navigate to URL
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
assert result.returncode == 0, f"Navigation failed: {result.stderr}"
|
||||
assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
|
||||
# Scenario 1: Basic screenshot extraction
|
||||
screenshot_dir = snapshot_chrome_dir.parent / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
# Step 4: Take screenshot (should connect to existing session)
|
||||
# Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
|
||||
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
|
||||
assert result_json and result_json['status'] == 'succeeded'
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
|
||||
assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
|
||||
|
||||
# Verify filesystem output
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
|
||||
# Scenario 2: Custom resolution
|
||||
screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2'
|
||||
screenshot_dir2.mkdir()
|
||||
env['CHROME_RESOLUTION'] = '800,600'
|
||||
|
||||
# Verify file is valid PNG
|
||||
file_size = screenshot_file.stat().st_size
|
||||
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
|
||||
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(screenshot_dir2),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Check PNG magic bytes
|
||||
screenshot_data = screenshot_file.read_bytes()
|
||||
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
|
||||
assert result.returncode == 0
|
||||
screenshot_file2 = screenshot_dir2 / 'screenshot.png'
|
||||
assert screenshot_file2.exists()
|
||||
file_size = screenshot_file2.stat().st_size
|
||||
assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}"
|
||||
|
||||
finally:
|
||||
# Cleanup: Kill Chrome
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
# Scenario 3: Wrong target ID (error case)
|
||||
screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
|
||||
screenshot_dir3.mkdir()
|
||||
(snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(screenshot_dir3),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
env=env
|
||||
)
|
||||
|
||||
def test_extracts_screenshot_without_session():
|
||||
"""Test screenshot extraction without existing Chrome session (fallback to own browser)."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# Create proper snapshot directory structure
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
assert result.returncode != 0
|
||||
assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
|
||||
|
||||
# Don't set up Chrome session or staticfile - screenshot should launch its own browser
|
||||
env = get_test_env()
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert 'screenshot.png' in result_json['output_str']
|
||||
|
||||
# Verify file created
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "screenshot.png not created"
|
||||
assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
|
||||
except RuntimeError as e:
|
||||
if 'Chrome' in str(e) or 'CDP' in str(e):
|
||||
pytest.skip(f"Chrome session setup failed: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def test_skips_when_staticfile_exists():
|
||||
@@ -344,57 +249,42 @@ def test_reports_missing_chrome():
|
||||
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
|
||||
|
||||
|
||||
def test_custom_resolution_and_user_agent():
|
||||
"""Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected."""
|
||||
def test_waits_for_navigation_timeout():
|
||||
"""Test that screenshot waits for navigation.json and times out quickly if missing."""
|
||||
import time
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create chrome directory without navigation.json to trigger timeout
|
||||
chrome_dir = tmpdir.parent / 'chrome'
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
|
||||
(chrome_dir / 'target_id.txt').write_text('test-target-id')
|
||||
# Intentionally NOT creating navigation.json to test timeout
|
||||
|
||||
screenshot_dir = tmpdir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_RESOLUTION'] = '800,600'
|
||||
env['CHROME_USER_AGENT'] = 'Test/1.0'
|
||||
env['SCREENSHOT_TIMEOUT'] = '2' # Set 2 second timeout
|
||||
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-timeout'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
timeout=5, # Test timeout slightly higher than SCREENSHOT_TIMEOUT
|
||||
env=env
|
||||
)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "screenshot.png not created"
|
||||
# Resolution affects file size
|
||||
assert screenshot_file.stat().st_size > 500, "Screenshot too small"
|
||||
|
||||
|
||||
def test_ssl_check_disabled():
|
||||
"""Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_CHECK_SSL_VALIDITY'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should succeed: {result.stderr}"
|
||||
assert (screenshot_dir / 'screenshot.png').exists()
|
||||
# Should fail when navigation.json doesn't appear
|
||||
assert result.returncode != 0, "Should fail when navigation.json missing"
|
||||
assert 'not loaded' in result.stderr.lower() or 'navigate' in result.stderr.lower(), f"Should mention navigation timeout: {result.stderr}"
|
||||
# Should complete within 3s (2s wait + 1s overhead)
|
||||
assert elapsed < 3, f"Should timeout within 3s, took {elapsed:.1f}s"
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
@@ -485,345 +375,114 @@ def test_invalid_resolution_format():
|
||||
# (depending on implementation - script should not crash with uncaught error)
|
||||
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
|
||||
|
||||
|
||||
def test_boolean_env_var_parsing():
|
||||
"""Test that boolean environment variables are parsed correctly."""
|
||||
import time
|
||||
def test_no_cdp_url_fails():
|
||||
"""Test error when chrome dir exists but no cdp_url.txt."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
|
||||
# Test various boolean formats for CHROME_HEADLESS
|
||||
for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']:
|
||||
env['CHROME_HEADLESS'] = bool_val
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
# Should either succeed or fail, but shouldn't crash on boolean parsing
|
||||
assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"
|
||||
|
||||
# Clean up screenshot file if created
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
if screenshot_file.exists():
|
||||
screenshot_file.unlink()
|
||||
|
||||
time.sleep(0.5) # Brief pause between attempts
|
||||
|
||||
|
||||
def test_integer_env_var_parsing():
|
||||
"""Test that integer environment variables are parsed correctly."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
|
||||
# Test valid and invalid integer formats for CHROME_TIMEOUT
|
||||
test_cases = [
|
||||
('60', True), # Valid integer
|
||||
('invalid', True), # Invalid - should use default
|
||||
('', True), # Empty - should use default
|
||||
]
|
||||
|
||||
for timeout_val, should_work in test_cases:
|
||||
env['CHROME_TIMEOUT'] = timeout_val
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
# Should either succeed or fail gracefully, but shouldn't crash on int parsing
|
||||
assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"
|
||||
|
||||
# Clean up screenshot file if created
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
if screenshot_file.exists():
|
||||
screenshot_file.unlink()
|
||||
|
||||
|
||||
def test_extracts_screenshot_with_all_config_options():
|
||||
"""Test screenshot with comprehensive config to exercise all code paths."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
# Set ALL config options to exercise all code paths
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env['CHROME_RESOLUTION'] = '800,600'
|
||||
env['CHROME_USER_AGENT'] = 'TestBot/1.0'
|
||||
env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch
|
||||
env['CHROME_TIMEOUT'] = '60'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"
|
||||
|
||||
# Verify JSONL output with success
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip().startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert 'screenshot.png' in result_json['output_str']
|
||||
|
||||
# Verify file created
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "screenshot.png should be created"
|
||||
assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
|
||||
|
||||
|
||||
def test_headless_mode_false():
|
||||
"""Test headless=false code path specifically."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
# Explicitly test headless=false (exercises the ternary false branch)
|
||||
env['CHROME_HEADLESS'] = 'false'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
# Should work or fail gracefully
|
||||
assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
|
||||
|
||||
|
||||
def test_invalid_url_causes_error():
|
||||
"""Test error path with invalid URL that causes navigation failure."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_TIMEOUT'] = '5' # Short timeout
|
||||
|
||||
# Use invalid URL to trigger error path
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should fail due to navigation error
|
||||
assert result.returncode != 0, "Should fail on invalid URL"
|
||||
# Should NOT emit JSONL (transient error)
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_with_corrupted_cdp_url_falls_back():
|
||||
"""Test that corrupted CDP URL file causes fallback to launching browser."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
# Create chrome directory with corrupted CDP URL
|
||||
chrome_dir = snapshot_dir / 'chrome'
|
||||
tmpdir = Path(tmpdir)
|
||||
chrome_dir = tmpdir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
(chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')
|
||||
# Create target_id.txt and navigation.json but NOT cdp_url.txt
|
||||
(chrome_dir / 'target_id.txt').write_text('test-target')
|
||||
(chrome_dir / 'navigation.json').write_text('{}')
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test
|
||||
screenshot_dir = tmpdir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
# Screenshot should try CDP, fail quickly, then fall back to launching own browser
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
timeout=7,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
# Should succeed by falling back to launching browser
|
||||
assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
|
||||
assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
|
||||
|
||||
# Verify screenshot was created via fallback path
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "Screenshot should be created via fallback"
|
||||
assert result.returncode != 0
|
||||
assert 'no chrome session' in result.stderr.lower()
|
||||
|
||||
|
||||
def test_user_agent_is_applied():
|
||||
"""Test that CHROME_USER_AGENT is actually applied when launching browser."""
|
||||
def test_no_target_id_fails():
|
||||
"""Test error when cdp_url exists but no target_id.txt."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
tmpdir = Path(tmpdir)
|
||||
chrome_dir = tmpdir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
# Create cdp_url.txt and navigation.json but NOT target_id.txt
|
||||
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
|
||||
(chrome_dir / 'navigation.json').write_text('{}')
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)'
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
screenshot_dir = tmpdir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
timeout=7,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
# Should succeed with custom user agent
|
||||
assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "Screenshot should be created"
|
||||
assert result.returncode != 0
|
||||
assert 'target_id.txt' in result.stderr.lower()
|
||||
|
||||
|
||||
def test_check_ssl_false_branch():
|
||||
"""Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
|
||||
def test_invalid_cdp_url_fails():
|
||||
"""Test error with malformed CDP URL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
tmpdir = Path(tmpdir)
|
||||
chrome_dir = tmpdir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
(chrome_dir / 'cdp_url.txt').write_text('invalid-url')
|
||||
(chrome_dir / 'target_id.txt').write_text('test-target')
|
||||
(chrome_dir / 'navigation.json').write_text('{}')
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
screenshot_dir = tmpdir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
# Test with both boolean false and string 'false'
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
timeout=7,
|
||||
env=get_test_env()
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
|
||||
assert (screenshot_dir / 'screenshot.png').exists()
|
||||
assert result.returncode != 0
|
||||
|
||||
|
||||
def test_alternative_env_var_names():
|
||||
"""Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc)."""
|
||||
def test_invalid_timeout_uses_default():
|
||||
"""Test that invalid SCREENSHOT_TIMEOUT falls back to default."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
tmpdir = Path(tmpdir)
|
||||
chrome_dir = tmpdir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
# No navigation.json to trigger timeout
|
||||
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/test')
|
||||
(chrome_dir / 'target_id.txt').write_text('test')
|
||||
|
||||
screenshot_dir = tmpdir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
# Use alternative env var names (without CHROME_ prefix)
|
||||
env['TIMEOUT'] = '45'
|
||||
env['RESOLUTION'] = '1024,768'
|
||||
env['USER_AGENT'] = 'AltBot/1.0'
|
||||
env['CHECK_SSL_VALIDITY'] = 'false'
|
||||
env['SCREENSHOT_TIMEOUT'] = 'invalid' # Should fallback to default (10s becomes NaN, treated as 0)
|
||||
|
||||
import time
|
||||
start = time.time()
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
timeout=5,
|
||||
env=env
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
|
||||
assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
|
||||
assert (screenshot_dir / 'screenshot.png').exists()
|
||||
|
||||
|
||||
def test_very_large_resolution():
|
||||
"""Test screenshot with very large resolution."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists()
|
||||
# 4K screenshot should be larger
|
||||
assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
|
||||
|
||||
|
||||
def test_very_small_resolution():
|
||||
"""Test screenshot with very small resolution."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_RESOLUTION'] = '320,240' # Very small
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
|
||||
assert (screenshot_dir / 'screenshot.png').exists()
|
||||
# With invalid timeout, parseInt returns NaN, which should be handled
|
||||
assert result.returncode != 0
|
||||
assert elapsed < 2 # Should fail quickly, not wait 10s
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -72,10 +72,9 @@ class TestSEOWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the SEO hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run SEO hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -22,7 +22,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
setup_chrome_session,
|
||||
chrome_session,
|
||||
cleanup_chrome,
|
||||
)
|
||||
|
||||
@@ -96,17 +96,15 @@ def test_singlefile_with_chrome_session():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
try:
|
||||
# Set up Chrome session using shared helper
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id='singlefile-test-crawl',
|
||||
snapshot_id='singlefile-test-snap',
|
||||
test_url=TEST_URL,
|
||||
navigate=False, # Don't navigate, singlefile will do that
|
||||
timeout=20,
|
||||
)
|
||||
|
||||
# Set up Chrome session using shared helper
|
||||
with chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id='singlefile-test-crawl',
|
||||
snapshot_id='singlefile-test-snap',
|
||||
test_url=TEST_URL,
|
||||
navigate=False, # Don't navigate, singlefile will do that
|
||||
timeout=20,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
|
||||
# So we need to run from a directory that has ../chrome pointing to our chrome dir
|
||||
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
|
||||
@@ -117,9 +115,8 @@ def test_singlefile_with_chrome_session():
|
||||
if not chrome_link.exists():
|
||||
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
|
||||
|
||||
env = get_test_env()
|
||||
# Use env from chrome_session
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Run singlefile - it should find and use the existing Chrome session
|
||||
result = subprocess.run(
|
||||
@@ -143,9 +140,6 @@ def test_singlefile_with_chrome_session():
|
||||
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
|
||||
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||
|
||||
finally:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_singlefile_disabled_skips():
|
||||
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||
|
||||
@@ -72,10 +72,9 @@ class TestSSLWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the SSL hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run SSL hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -72,16 +72,14 @@ class TestStaticfileWithChrome(TestCase):
|
||||
test_url=test_url,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
|
||||
# Get environment and run the staticfile hook
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
|
||||
|
||||
|
||||
# Run staticfile hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120, # Longer timeout as it waits for navigation
|
||||
|
||||
@@ -384,11 +384,11 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
|
||||
"""Test that ArchiveResultWorker.get_queue() only blocks on foreground extractors."""
|
||||
"""Test that background hooks don't block foreground extractors from running."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# This test verifies the fix for the orchestrator bug where background hooks
|
||||
# were blocking parser extractors from running
|
||||
# This test verifies that background hooks run concurrently with foreground hooks
|
||||
# and don't block parser extractors
|
||||
|
||||
# Start a crawl
|
||||
env = os.environ.copy()
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
"""
|
||||
Orchestrator for managing worker processes.
|
||||
|
||||
The Orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
|
||||
and lazily spawns worker processes when there is work to be done.
|
||||
The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed.
|
||||
|
||||
Architecture:
|
||||
Orchestrator (main loop, polls queues)
|
||||
├── CrawlWorker subprocess(es)
|
||||
├── SnapshotWorker subprocess(es)
|
||||
└── ArchiveResultWorker subprocess(es)
|
||||
└── Each worker spawns task subprocesses via CLI
|
||||
Orchestrator (polls Crawl queue)
|
||||
└── CrawlWorker(s) (one per active Crawl)
|
||||
└── SnapshotWorker(s) (one per Snapshot, up to limit)
|
||||
└── Hook Processes (sequential, forked by SnapshotWorker)
|
||||
|
||||
Usage:
|
||||
# Default: runs forever (for use as subprocess of server)
|
||||
@@ -38,7 +36,7 @@ from django.utils import timezone
|
||||
from rich import print
|
||||
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||
from .worker import Worker, CrawlWorker
|
||||
|
||||
|
||||
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||
@@ -52,22 +50,27 @@ def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||
class Orchestrator:
|
||||
"""
|
||||
Manages worker processes by polling queues and spawning workers as needed.
|
||||
|
||||
|
||||
The orchestrator:
|
||||
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
|
||||
2. If items exist and fewer than MAX_CONCURRENT workers are running, spawns workers
|
||||
1. Polls Crawl queue
|
||||
2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers
|
||||
3. Monitors worker health and cleans up stale PIDs
|
||||
4. Exits when all queues are empty (unless daemon mode)
|
||||
4. Exits when queue is empty (unless daemon mode)
|
||||
|
||||
Architecture:
|
||||
- Orchestrator spawns CrawlWorkers (one per active Crawl)
|
||||
- Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit)
|
||||
- Each SnapshotWorker runs hooks sequentially for its snapshot
|
||||
"""
|
||||
|
||||
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker]
|
||||
|
||||
# Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
|
||||
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
|
||||
|
||||
# Configuration
|
||||
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
|
||||
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
|
||||
MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type
|
||||
MAX_TOTAL_WORKERS: int = 24 # Max workers across all types
|
||||
|
||||
MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously
|
||||
|
||||
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
|
||||
self.exit_on_idle = exit_on_idle
|
||||
self.crawl_id = crawl_id # If set, only process work for this crawl
|
||||
@@ -76,11 +79,9 @@ class Orchestrator:
|
||||
self.idle_count: int = 0
|
||||
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
|
||||
|
||||
# In foreground mode (exit_on_idle=True), limit workers but allow enough
|
||||
# for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
|
||||
# In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
|
||||
if self.exit_on_idle:
|
||||
self.MAX_WORKERS_PER_TYPE = 1
|
||||
self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
|
||||
self.MAX_CRAWL_WORKERS = 1
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
||||
@@ -109,14 +110,18 @@ class Orchestrator:
|
||||
# Clean up any stale Process records from previous runs
|
||||
stale_count = Process.cleanup_stale_running()
|
||||
|
||||
# Clean up orphaned Chrome processes from previous crashes
|
||||
chrome_count = Process.cleanup_orphaned_chrome()
|
||||
|
||||
# Collect startup metadata
|
||||
metadata = {
|
||||
'max_workers_per_type': self.MAX_WORKERS_PER_TYPE,
|
||||
'max_total_workers': self.MAX_TOTAL_WORKERS,
|
||||
'max_crawl_workers': self.MAX_CRAWL_WORKERS,
|
||||
'poll_interval': self.POLL_INTERVAL,
|
||||
}
|
||||
if stale_count:
|
||||
metadata['cleaned_stale_pids'] = stale_count
|
||||
if chrome_count:
|
||||
metadata['cleaned_orphaned_chrome'] = chrome_count
|
||||
|
||||
log_worker_event(
|
||||
worker_type='Orchestrator',
|
||||
@@ -126,8 +131,34 @@ class Orchestrator:
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
def terminate_all_workers(self) -> None:
|
||||
"""Terminate all running worker processes."""
|
||||
from archivebox.machine.models import Process
|
||||
import signal
|
||||
|
||||
# Get all running worker processes
|
||||
running_workers = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
status__in=['running', 'started']
|
||||
)
|
||||
|
||||
for worker_process in running_workers:
|
||||
try:
|
||||
# Send SIGTERM to gracefully terminate the worker
|
||||
os.kill(worker_process.pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
# Process already dead
|
||||
pass
|
||||
except Exception:
|
||||
# Ignore other errors during shutdown
|
||||
pass
|
||||
|
||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||
"""Called when orchestrator shuts down."""
|
||||
# Terminate all worker processes in exit_on_idle mode
|
||||
if self.exit_on_idle:
|
||||
self.terminate_all_workers()
|
||||
|
||||
# Update Process record status
|
||||
if hasattr(self, 'db_process') and self.db_process:
|
||||
# KeyboardInterrupt is a graceful shutdown, not an error
|
||||
@@ -163,20 +194,15 @@ class Orchestrator:
|
||||
return len(WorkerClass.get_running_workers())
|
||||
|
||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||
"""Determine if we should spawn a new worker of the given type."""
|
||||
"""Determine if we should spawn a new CrawlWorker."""
|
||||
if queue_count == 0:
|
||||
return False
|
||||
|
||||
# Check per-type limit
|
||||
# Check CrawlWorker limit
|
||||
running_workers = WorkerClass.get_running_workers()
|
||||
running_count = len(running_workers)
|
||||
|
||||
if running_count >= self.MAX_WORKERS_PER_TYPE:
|
||||
return False
|
||||
|
||||
# Check total limit
|
||||
total_workers = self.get_total_worker_count()
|
||||
if total_workers >= self.MAX_TOTAL_WORKERS:
|
||||
if running_count >= self.MAX_CRAWL_WORKERS:
|
||||
return False
|
||||
|
||||
# Check if we already have enough workers for the queue size
|
||||
@@ -190,7 +216,7 @@ class Orchestrator:
|
||||
"""Spawn a new worker process. Returns PID or None if spawn failed."""
|
||||
try:
|
||||
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
|
||||
pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
|
||||
pid = WorkerClass.start(crawl_id=self.crawl_id)
|
||||
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
|
||||
|
||||
# CRITICAL: Block until worker registers itself in Process table
|
||||
@@ -259,24 +285,49 @@ class Orchestrator:
|
||||
|
||||
def check_queues_and_spawn_workers(self) -> dict[str, int]:
|
||||
"""
|
||||
Check all queues and spawn workers as needed.
|
||||
Returns dict of queue sizes by worker type.
|
||||
Check Crawl queue and spawn CrawlWorkers as needed.
|
||||
Returns dict of queue sizes.
|
||||
"""
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
queue_sizes = {}
|
||||
|
||||
for WorkerClass in self.WORKER_TYPES:
|
||||
# Get queue for this worker type
|
||||
# Need to instantiate worker to get queue (for model access)
|
||||
worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id) # temp instance just for queue access
|
||||
queue = worker.get_queue()
|
||||
queue_count = queue.count()
|
||||
queue_sizes[WorkerClass.name] = queue_count
|
||||
# Only check Crawl queue
|
||||
crawl_queue = Crawl.objects.filter(
|
||||
retry_at__lte=timezone.now()
|
||||
).exclude(
|
||||
status__in=Crawl.FINAL_STATES
|
||||
)
|
||||
|
||||
# Apply crawl_id filter if set
|
||||
if self.crawl_id:
|
||||
crawl_queue = crawl_queue.filter(id=self.crawl_id)
|
||||
|
||||
crawl_queue = crawl_queue.order_by('retry_at')
|
||||
crawl_count = crawl_queue.count()
|
||||
queue_sizes['crawl'] = crawl_count
|
||||
|
||||
# Spawn CrawlWorker if needed
|
||||
if self.should_spawn_worker(CrawlWorker, crawl_count):
|
||||
# Claim next crawl
|
||||
crawl = crawl_queue.first()
|
||||
if crawl and self._claim_crawl(crawl):
|
||||
CrawlWorker.start(crawl_id=str(crawl.id))
|
||||
|
||||
# Spawn worker if needed
|
||||
if self.should_spawn_worker(WorkerClass, queue_count):
|
||||
self.spawn_worker(WorkerClass)
|
||||
|
||||
return queue_sizes
|
||||
|
||||
def _claim_crawl(self, crawl) -> bool:
|
||||
"""Atomically claim a crawl using optimistic locking."""
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
updated = Crawl.objects.filter(
|
||||
pk=crawl.pk,
|
||||
retry_at=crawl.retry_at,
|
||||
).update(
|
||||
retry_at=timezone.now() + timedelta(hours=24), # Long lock (crawls take time)
|
||||
)
|
||||
|
||||
return updated == 1
|
||||
|
||||
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
|
||||
"""Check if any queue has pending work."""
|
||||
@@ -287,30 +338,21 @@ class Orchestrator:
|
||||
return self.get_total_worker_count() > 0
|
||||
|
||||
def has_future_work(self) -> bool:
|
||||
"""Check if there's work scheduled for the future (retry_at > now)."""
|
||||
for WorkerClass in self.WORKER_TYPES:
|
||||
worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id)
|
||||
Model = worker.get_model()
|
||||
"""Check if there's work scheduled for the future (retry_at > now) in Crawl queue."""
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
# Build filter for future work, respecting crawl_id if set
|
||||
qs = Model.objects.filter(
|
||||
retry_at__gt=timezone.now()
|
||||
).exclude(
|
||||
status__in=Model.FINAL_STATES
|
||||
)
|
||||
# Build filter for future work, respecting crawl_id if set
|
||||
qs = Crawl.objects.filter(
|
||||
retry_at__gt=timezone.now()
|
||||
).exclude(
|
||||
status__in=Crawl.FINAL_STATES
|
||||
)
|
||||
|
||||
# Apply crawl_id filter if set
|
||||
if self.crawl_id:
|
||||
if WorkerClass.name == 'crawl':
|
||||
qs = qs.filter(id=self.crawl_id)
|
||||
elif WorkerClass.name == 'snapshot':
|
||||
qs = qs.filter(crawl_id=self.crawl_id)
|
||||
elif WorkerClass.name == 'archiveresult':
|
||||
qs = qs.filter(snapshot__crawl_id=self.crawl_id)
|
||||
# Apply crawl_id filter if set
|
||||
if self.crawl_id:
|
||||
qs = qs.filter(id=self.crawl_id)
|
||||
|
||||
if qs.count() > 0:
|
||||
return True
|
||||
return False
|
||||
return qs.count() > 0
|
||||
|
||||
def on_tick(self, queue_sizes: dict[str, int]) -> None:
|
||||
"""Called each orchestrator tick. Override for custom behavior."""
|
||||
@@ -345,20 +387,20 @@ class Orchestrator:
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Main orchestrator loop."""
|
||||
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
|
||||
from archivebox.misc.logging import IS_TTY, CONSOLE
|
||||
from rich.live import Live
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Enable progress bars only in TTY + foreground mode
|
||||
# Enable progress layout only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
|
||||
self.on_startup()
|
||||
task_ids = {}
|
||||
|
||||
if not show_progress:
|
||||
# No progress bars - just run normally
|
||||
self._run_orchestrator_loop(None, task_ids)
|
||||
# No progress layout - just run normally
|
||||
self._run_orchestrator_loop(None)
|
||||
else:
|
||||
# Redirect worker subprocess output to /dev/null
|
||||
devnull_fd = os.open(os.devnull, os.O_WRONLY)
|
||||
@@ -384,14 +426,16 @@ class Orchestrator:
|
||||
original_console = logging_module.CONSOLE
|
||||
logging_module.CONSOLE = orchestrator_console
|
||||
|
||||
# Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
|
||||
with Progress(
|
||||
TextColumn("[cyan]{task.description}"),
|
||||
BarColumn(bar_width=40),
|
||||
TaskProgressColumn(),
|
||||
# Create layout and run with Live display
|
||||
progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id)
|
||||
|
||||
with Live(
|
||||
progress_layout.get_layout(),
|
||||
refresh_per_second=4,
|
||||
screen=True,
|
||||
console=orchestrator_console,
|
||||
) as progress:
|
||||
self._run_orchestrator_loop(progress, task_ids)
|
||||
):
|
||||
self._run_orchestrator_loop(progress_layout)
|
||||
|
||||
# Restore original console
|
||||
logging_module.CONSOLE = original_console
|
||||
@@ -409,22 +453,68 @@ class Orchestrator:
|
||||
pass
|
||||
# stdout_for_console is closed by orchestrator_console
|
||||
|
||||
def _run_orchestrator_loop(self, progress, task_ids):
|
||||
def _run_orchestrator_loop(self, progress_layout):
|
||||
"""Run the main orchestrator loop with optional progress display."""
|
||||
last_queue_sizes = {}
|
||||
last_snapshot_count = None
|
||||
tick_count = 0
|
||||
|
||||
# Track snapshot progress to detect changes
|
||||
snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
|
||||
|
||||
try:
|
||||
while True:
|
||||
tick_count += 1
|
||||
|
||||
# Check queues and spawn workers
|
||||
queue_sizes = self.check_queues_and_spawn_workers()
|
||||
|
||||
# Debug queue sizes (only when changed)
|
||||
if progress and queue_sizes != last_queue_sizes:
|
||||
progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
|
||||
last_queue_sizes = queue_sizes.copy()
|
||||
# Get worker counts for each type
|
||||
worker_counts = {
|
||||
WorkerClass.name: len(WorkerClass.get_running_workers())
|
||||
for WorkerClass in self.WORKER_TYPES
|
||||
}
|
||||
|
||||
# Update progress bars
|
||||
if progress:
|
||||
# Update layout if enabled
|
||||
if progress_layout:
|
||||
# Get crawl queue and worker counts
|
||||
crawl_queue_count = queue_sizes.get('crawl', 0)
|
||||
crawl_workers_count = worker_counts.get('crawl', 0)
|
||||
|
||||
# Determine orchestrator status
|
||||
if crawl_workers_count > 0:
|
||||
status = "Working"
|
||||
elif crawl_queue_count > 0:
|
||||
status = "Spawning"
|
||||
else:
|
||||
status = "Idle"
|
||||
|
||||
# Update orchestrator status
|
||||
progress_layout.update_orchestrator_status(
|
||||
status=status,
|
||||
crawl_queue_count=crawl_queue_count,
|
||||
crawl_workers_count=crawl_workers_count,
|
||||
max_crawl_workers=self.MAX_CRAWL_WORKERS,
|
||||
)
|
||||
|
||||
# Log queue size changes
|
||||
if queue_sizes != last_queue_sizes:
|
||||
for worker_type, count in queue_sizes.items():
|
||||
old_count = last_queue_sizes.get(worker_type, 0)
|
||||
if count != old_count:
|
||||
if count > old_count:
|
||||
progress_layout.log_event(
|
||||
f"{worker_type.capitalize()} queue: {old_count} → {count}",
|
||||
style="yellow"
|
||||
)
|
||||
else:
|
||||
progress_layout.log_event(
|
||||
f"{worker_type.capitalize()} queue: {old_count} → {count}",
|
||||
style="green"
|
||||
)
|
||||
last_queue_sizes = queue_sizes.copy()
|
||||
|
||||
# Update snapshot progress
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# Get all started snapshots (optionally filtered by crawl_id)
|
||||
@@ -438,9 +528,36 @@ class Orchestrator:
|
||||
|
||||
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
|
||||
|
||||
# Debug snapshot count (only when changed)
|
||||
# Log snapshot count changes and details
|
||||
if len(active_snapshots) != last_snapshot_count:
|
||||
progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
|
||||
if last_snapshot_count is not None:
|
||||
if len(active_snapshots) > last_snapshot_count:
|
||||
progress_layout.log_event(
|
||||
f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}",
|
||||
style="cyan"
|
||||
)
|
||||
# Log which snapshots started
|
||||
for snapshot in active_snapshots[-1:]: # Just show the newest one
|
||||
progress_layout.log_event(
|
||||
f"Started: {snapshot.url[:60]}",
|
||||
style="green"
|
||||
)
|
||||
|
||||
# Log SnapshotWorker count
|
||||
from archivebox.machine.models import Process
|
||||
all_workers = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
status__in=['running', 'started']
|
||||
).count()
|
||||
progress_layout.log_event(
|
||||
f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)",
|
||||
style="grey53"
|
||||
)
|
||||
else:
|
||||
progress_layout.log_event(
|
||||
f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}",
|
||||
style="blue"
|
||||
)
|
||||
last_snapshot_count = len(active_snapshots)
|
||||
|
||||
# Track which snapshots are still active
|
||||
@@ -450,13 +567,14 @@ class Orchestrator:
|
||||
active_ids.add(snapshot.id)
|
||||
|
||||
total = snapshot.archiveresult_set.count()
|
||||
if total == 0:
|
||||
continue
|
||||
|
||||
completed = snapshot.archiveresult_set.filter(
|
||||
status__in=['succeeded', 'skipped', 'failed']
|
||||
).count()
|
||||
|
||||
# Count hooks by status for debugging
|
||||
queued = snapshot.archiveresult_set.filter(status='queued').count()
|
||||
started = snapshot.archiveresult_set.filter(status='started').count()
|
||||
|
||||
# Find currently running hook (ordered by hook_name to get lowest step number)
|
||||
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
|
||||
if not current_ar:
|
||||
@@ -472,24 +590,78 @@ class Orchestrator:
|
||||
# Clean up the name: remove prefix and extension
|
||||
clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
|
||||
clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
|
||||
current_plugin = f" • {clean_name}"
|
||||
current_plugin = clean_name
|
||||
elif total == 0:
|
||||
# Snapshot just started, hooks not created yet
|
||||
current_plugin = "initializing"
|
||||
elif queued > 0:
|
||||
# Hooks created but none started yet
|
||||
current_plugin = "waiting"
|
||||
|
||||
# Build description with URL + current plugin
|
||||
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
|
||||
description = f"{url}{current_plugin}"
|
||||
# Update snapshot worker (show even if no hooks yet)
|
||||
# Debug: Log first time we see this snapshot
|
||||
if snapshot.id not in progress_layout.snapshot_to_worker:
|
||||
progress_layout.log_event(
|
||||
f"Assigning to worker: {snapshot.url[:50]}",
|
||||
style="grey53"
|
||||
)
|
||||
|
||||
# Create or update task
|
||||
if snapshot.id not in task_ids:
|
||||
task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
|
||||
else:
|
||||
# Update both progress and description
|
||||
progress.update(task_ids[snapshot.id], description=description, completed=completed)
|
||||
# Track progress changes
|
||||
prev_progress = snapshot_progress.get(snapshot.id, (0, 0, ''))
|
||||
curr_progress = (total, completed, current_plugin)
|
||||
|
||||
# Remove tasks for snapshots that are no longer active
|
||||
for snapshot_id in list(task_ids.keys()):
|
||||
if prev_progress != curr_progress:
|
||||
prev_total, prev_completed, prev_plugin = prev_progress
|
||||
|
||||
# Log hooks created
|
||||
if total > prev_total:
|
||||
progress_layout.log_event(
|
||||
f"Hooks created: {total} for {snapshot.url[:40]}",
|
||||
style="cyan"
|
||||
)
|
||||
|
||||
# Log hook completion
|
||||
if completed > prev_completed:
|
||||
progress_layout.log_event(
|
||||
f"Hook completed: {completed}/{total} for {snapshot.url[:40]}",
|
||||
style="green"
|
||||
)
|
||||
|
||||
# Log plugin change
|
||||
if current_plugin and current_plugin != prev_plugin:
|
||||
progress_layout.log_event(
|
||||
f"Running: {current_plugin} ({snapshot.url[:40]})",
|
||||
style="yellow"
|
||||
)
|
||||
|
||||
snapshot_progress[snapshot.id] = curr_progress
|
||||
|
||||
# Debug: Every 10 ticks, log detailed status if stuck at initializing
|
||||
if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing":
|
||||
progress_layout.log_event(
|
||||
f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})",
|
||||
style="red"
|
||||
)
|
||||
|
||||
progress_layout.update_snapshot_worker(
|
||||
snapshot_id=snapshot.id,
|
||||
url=snapshot.url,
|
||||
total=max(total, 1), # Show at least 1 to avoid division by zero
|
||||
completed=completed,
|
||||
current_plugin=current_plugin,
|
||||
)
|
||||
|
||||
# Remove snapshots that are no longer active
|
||||
for snapshot_id in list(progress_layout.snapshot_to_worker.keys()):
|
||||
if snapshot_id not in active_ids:
|
||||
progress.remove_task(task_ids[snapshot_id])
|
||||
del task_ids[snapshot_id]
|
||||
progress_layout.log_event(
|
||||
f"Snapshot completed/removed",
|
||||
style="blue"
|
||||
)
|
||||
progress_layout.remove_snapshot_worker(snapshot_id)
|
||||
# Also clean up progress tracking
|
||||
if snapshot_id in snapshot_progress:
|
||||
del snapshot_progress[snapshot_id]
|
||||
|
||||
# Track idle state
|
||||
has_pending = self.has_pending_work(queue_sizes)
|
||||
@@ -503,6 +675,8 @@ class Orchestrator:
|
||||
|
||||
# Check if we should exit
|
||||
if self.should_exit(queue_sizes):
|
||||
if progress_layout:
|
||||
progress_layout.log_event("All work complete", style="green")
|
||||
log_worker_event(
|
||||
worker_type='Orchestrator',
|
||||
event='All work complete',
|
||||
@@ -514,8 +688,12 @@ class Orchestrator:
|
||||
time.sleep(self.POLL_INTERVAL)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
if progress_layout:
|
||||
progress_layout.log_event("Interrupted by user", style="red")
|
||||
print() # Newline after ^C
|
||||
except BaseException as e:
|
||||
if progress_layout:
|
||||
progress_layout.log_event(f"Error: {e}", style="red")
|
||||
self.on_shutdown(error=e)
|
||||
raise
|
||||
else:
|
||||
|
||||
@@ -34,7 +34,7 @@ CPU_COUNT = cpu_count()
|
||||
WORKER_TYPES: dict[str, type['Worker']] = {}
|
||||
|
||||
|
||||
def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
|
||||
def _run_worker(worker_class_name: str, worker_id: int, **kwargs):
|
||||
"""
|
||||
Module-level function to run a worker. Must be at module level for pickling.
|
||||
"""
|
||||
@@ -43,16 +43,28 @@ def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
|
||||
|
||||
# Get worker class by name to avoid pickling class objects
|
||||
worker_cls = WORKER_TYPES[worker_class_name]
|
||||
worker = worker_cls(worker_id=worker_id, daemon=daemon, **kwargs)
|
||||
worker = worker_cls(worker_id=worker_id, **kwargs)
|
||||
worker.runloop()
|
||||
|
||||
|
||||
def _run_snapshot_worker(snapshot_id: str, worker_id: int, **kwargs):
|
||||
"""
|
||||
Module-level function to run a SnapshotWorker for a specific snapshot.
|
||||
Must be at module level for pickling compatibility.
|
||||
"""
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
worker = SnapshotWorker(snapshot_id=snapshot_id, worker_id=worker_id, **kwargs)
|
||||
worker.runloop()
|
||||
|
||||
|
||||
class Worker:
|
||||
"""
|
||||
Base worker class that polls a queue and processes items directly.
|
||||
Base worker class for CrawlWorker and SnapshotWorker.
|
||||
|
||||
Each item is processed by calling its state machine tick() method.
|
||||
Workers exit when idle for too long (unless daemon mode).
|
||||
Workers are spawned as subprocesses to process crawls and snapshots.
|
||||
Each worker type has its own custom runloop implementation.
|
||||
"""
|
||||
|
||||
name: ClassVar[str] = 'worker'
|
||||
@@ -60,16 +72,10 @@ class Worker:
|
||||
# Configuration (can be overridden by subclasses)
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
|
||||
POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
|
||||
IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
|
||||
|
||||
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
|
||||
def __init__(self, worker_id: int = 0, **kwargs: Any):
|
||||
self.worker_id = worker_id
|
||||
self.daemon = daemon
|
||||
self.crawl_id = crawl_id # If set, only process work for this crawl
|
||||
self.pid: int = os.getpid()
|
||||
self.pid_file: Path | None = None
|
||||
self.idle_count: int = 0
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[underline]{self.__class__.__name__}[/underline]\\[id={self.worker_id}, pid={self.pid}]'
|
||||
@@ -78,55 +84,6 @@ class Worker:
|
||||
"""Get the Django model class. Subclasses must override this."""
|
||||
raise NotImplementedError("Subclasses must implement get_model()")
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""Get the queue of objects ready for processing."""
|
||||
Model = self.get_model()
|
||||
return Model.objects.filter(
|
||||
retry_at__lte=timezone.now()
|
||||
).exclude(
|
||||
status__in=Model.FINAL_STATES
|
||||
).order_by('retry_at')
|
||||
|
||||
def claim_next(self):
|
||||
"""
|
||||
Atomically claim the next object from the queue.
|
||||
Returns the claimed object or None if queue is empty or claim failed.
|
||||
"""
|
||||
Model = self.get_model()
|
||||
|
||||
queue = self.get_queue()
|
||||
obj = queue.first()
|
||||
if obj is None:
|
||||
return None
|
||||
|
||||
# Atomic claim using optimistic locking on retry_at
|
||||
claimed = Model.objects.filter(
|
||||
pk=obj.pk,
|
||||
retry_at=obj.retry_at,
|
||||
).update(
|
||||
retry_at=timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
|
||||
)
|
||||
|
||||
if claimed == 1:
|
||||
obj.refresh_from_db()
|
||||
return obj
|
||||
|
||||
return None # Someone else claimed it
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""
|
||||
Process a single item by calling its state machine tick().
|
||||
Returns True on success, False on failure.
|
||||
Subclasses can override for custom processing.
|
||||
"""
|
||||
try:
|
||||
obj.sm.tick()
|
||||
return True
|
||||
except Exception as e:
|
||||
# Error will be logged in runloop's completion event
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def on_startup(self) -> None:
|
||||
"""Called when worker starts."""
|
||||
from archivebox.machine.models import Process
|
||||
@@ -139,7 +96,7 @@ class Worker:
|
||||
if self.db_process.process_type != Process.TypeChoices.WORKER:
|
||||
self.db_process.process_type = Process.TypeChoices.WORKER
|
||||
update_fields.append('process_type')
|
||||
# Store worker type name (crawl/snapshot/archiveresult) in worker_type field
|
||||
# Store worker type name (crawl/snapshot) in worker_type field
|
||||
if not self.db_process.worker_type:
|
||||
self.db_process.worker_type = self.name
|
||||
update_fields.append('worker_type')
|
||||
@@ -148,13 +105,11 @@ class Worker:
|
||||
|
||||
# Determine worker type for logging
|
||||
worker_type_name = self.__class__.__name__
|
||||
indent_level = 1 # Default for most workers
|
||||
indent_level = 1 # Default for CrawlWorker
|
||||
|
||||
# Adjust indent level based on worker type
|
||||
# SnapshotWorker gets indent level 2
|
||||
if 'Snapshot' in worker_type_name:
|
||||
indent_level = 2
|
||||
elif 'ArchiveResult' in worker_type_name:
|
||||
indent_level = 3
|
||||
|
||||
log_worker_event(
|
||||
worker_type=worker_type_name,
|
||||
@@ -162,10 +117,6 @@ class Worker:
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
worker_id=str(self.worker_id),
|
||||
metadata={
|
||||
'max_concurrent': self.MAX_CONCURRENT_TASKS,
|
||||
'poll_interval': self.POLL_INTERVAL,
|
||||
},
|
||||
)
|
||||
|
||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||
@@ -179,12 +130,10 @@ class Worker:
|
||||
|
||||
# Determine worker type for logging
|
||||
worker_type_name = self.__class__.__name__
|
||||
indent_level = 1
|
||||
indent_level = 1 # CrawlWorker
|
||||
|
||||
if 'Snapshot' in worker_type_name:
|
||||
indent_level = 2
|
||||
elif 'ArchiveResult' in worker_type_name:
|
||||
indent_level = 3
|
||||
|
||||
log_worker_event(
|
||||
worker_type=worker_type_name,
|
||||
@@ -195,121 +144,157 @@ class Worker:
|
||||
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
|
||||
)
|
||||
|
||||
def should_exit(self) -> bool:
|
||||
"""Check if worker should exit due to idle timeout."""
|
||||
if self.daemon:
|
||||
return False
|
||||
def _terminate_background_hooks(
|
||||
self,
|
||||
background_processes: dict[str, 'Process'],
|
||||
worker_type: str,
|
||||
indent_level: int,
|
||||
) -> None:
|
||||
"""
|
||||
Terminate background hooks in 3 phases (shared logic for Crawl/Snapshot workers).
|
||||
|
||||
if self.IDLE_TIMEOUT == 0:
|
||||
return False
|
||||
Phase 1: Send SIGTERM to all bg hooks + children in parallel (polite request to wrap up)
|
||||
Phase 2: Wait for each hook's remaining timeout before SIGKILL
|
||||
Phase 3: SIGKILL any stragglers that exceeded their timeout
|
||||
|
||||
return self.idle_count >= self.IDLE_TIMEOUT
|
||||
Args:
|
||||
background_processes: Dict mapping hook name -> Process instance
|
||||
worker_type: Worker type name for logging (e.g., 'CrawlWorker', 'SnapshotWorker')
|
||||
indent_level: Logging indent level (1 for Crawl, 2 for Snapshot)
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Main worker loop - polls queue, processes items."""
|
||||
self.on_startup()
|
||||
if not background_processes:
|
||||
return
|
||||
|
||||
# Determine worker type for logging
|
||||
worker_type_name = self.__class__.__name__
|
||||
indent_level = 1
|
||||
now = time.time()
|
||||
|
||||
if 'Snapshot' in worker_type_name:
|
||||
indent_level = 2
|
||||
elif 'ArchiveResult' in worker_type_name:
|
||||
indent_level = 3
|
||||
# Phase 1: Send SIGTERM to ALL background processes + children in parallel
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'Sending SIGTERM to {len(background_processes)} background hooks (+ children)',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
try:
|
||||
while True:
|
||||
# Try to claim and process an item
|
||||
obj = self.claim_next()
|
||||
|
||||
if obj is not None:
|
||||
self.idle_count = 0
|
||||
|
||||
# Build metadata for task start
|
||||
start_metadata = {}
|
||||
url = None
|
||||
if hasattr(obj, 'url'):
|
||||
# SnapshotWorker
|
||||
url = str(obj.url) if obj.url else None
|
||||
elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'):
|
||||
# ArchiveResultWorker
|
||||
url = str(obj.snapshot.url) if obj.snapshot.url else None
|
||||
elif hasattr(obj, 'get_urls_list'):
|
||||
# CrawlWorker
|
||||
urls = obj.get_urls_list()
|
||||
url = urls[0] if urls else None
|
||||
|
||||
plugin = None
|
||||
if hasattr(obj, 'plugin'):
|
||||
# ArchiveResultWorker, Crawl
|
||||
plugin = obj.plugin
|
||||
# Build deadline map first (before killing, to get accurate remaining time)
|
||||
deadlines = {}
|
||||
for hook_name, process in background_processes.items():
|
||||
elapsed = now - process.started_at.timestamp()
|
||||
remaining = max(0, process.timeout - elapsed)
|
||||
deadline = now + remaining
|
||||
deadlines[hook_name] = (process, deadline)
|
||||
|
||||
# Send SIGTERM to all process trees in parallel (non-blocking)
|
||||
for hook_name, process in background_processes.items():
|
||||
try:
|
||||
# Get chrome children (renderer processes etc) before sending signal
|
||||
children_pids = process.get_children_pids()
|
||||
if children_pids:
|
||||
# Chrome hook with children - kill tree
|
||||
os.kill(process.pid, signal.SIGTERM)
|
||||
for child_pid in children_pids:
|
||||
try:
|
||||
os.kill(child_pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
log_worker_event(
|
||||
worker_type=worker_type_name,
|
||||
event='Processing',
|
||||
worker_type=worker_type,
|
||||
event=f'Sent SIGTERM to {hook_name} + {len(children_pids)} children',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
worker_id=str(self.worker_id),
|
||||
url=url,
|
||||
plugin=plugin,
|
||||
metadata=start_metadata if start_metadata else None,
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
success = self.process_item(obj)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
# Build metadata for task completion
|
||||
complete_metadata = {
|
||||
'duration': elapsed,
|
||||
'status': 'success' if success else 'failed',
|
||||
}
|
||||
|
||||
log_worker_event(
|
||||
worker_type=worker_type_name,
|
||||
event='Completed' if success else 'Failed',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
worker_id=str(self.worker_id),
|
||||
url=url,
|
||||
plugin=plugin,
|
||||
metadata=complete_metadata,
|
||||
)
|
||||
else:
|
||||
# No work available - idle logging suppressed
|
||||
self.idle_count += 1
|
||||
# No children - normal kill
|
||||
os.kill(process.pid, signal.SIGTERM)
|
||||
except ProcessLookupError:
|
||||
pass # Already dead
|
||||
except Exception as e:
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'Failed to SIGTERM {hook_name}: {e}',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
# Check if we should exit
|
||||
if self.should_exit():
|
||||
# Exit logging suppressed - shutdown will be logged by on_shutdown()
|
||||
break
|
||||
# Phase 2: Wait for all processes in parallel, respecting individual timeouts
|
||||
for hook_name, (process, deadline) in deadlines.items():
|
||||
remaining = deadline - now
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'Waiting up to {remaining:.1f}s for {hook_name}',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
time.sleep(self.POLL_INTERVAL)
|
||||
# Poll all processes in parallel using Process.poll()
|
||||
still_running = set(deadlines.keys())
|
||||
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
except BaseException as e:
|
||||
self.on_shutdown(error=e)
|
||||
raise
|
||||
else:
|
||||
self.on_shutdown()
|
||||
while still_running:
|
||||
time.sleep(0.1)
|
||||
now = time.time()
|
||||
|
||||
for hook_name in list(still_running):
|
||||
process, deadline = deadlines[hook_name]
|
||||
|
||||
# Check if process exited using Process.poll()
|
||||
exit_code = process.poll()
|
||||
if exit_code is not None:
|
||||
# Process exited
|
||||
still_running.remove(hook_name)
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'✓ {hook_name} exited with code {exit_code}',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if deadline exceeded
|
||||
if now >= deadline:
|
||||
# Timeout exceeded - SIGKILL process tree
|
||||
try:
|
||||
# Get children before killing (chrome may have spawned more)
|
||||
children_pids = process.get_children_pids()
|
||||
if children_pids:
|
||||
# Kill children first
|
||||
for child_pid in children_pids:
|
||||
try:
|
||||
os.kill(child_pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
# Then kill parent
|
||||
process.kill(signal_num=signal.SIGKILL)
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
except Exception as e:
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'Failed to SIGKILL {hook_name}: {e}',
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
still_running.remove(hook_name)
|
||||
|
||||
@classmethod
|
||||
def start(cls, worker_id: int | None = None, daemon: bool = False, **kwargs: Any) -> int:
|
||||
def start(cls, **kwargs: Any) -> int:
|
||||
"""
|
||||
Fork a new worker as a subprocess.
|
||||
Returns the PID of the new process.
|
||||
"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
if worker_id is None:
|
||||
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||
|
||||
# Use module-level function for pickling compatibility
|
||||
proc = MPProcess(
|
||||
target=_run_worker,
|
||||
args=(cls.name, worker_id, daemon),
|
||||
args=(cls.name, worker_id),
|
||||
kwargs=kwargs,
|
||||
name=f'{cls.name}_worker_{worker_id}',
|
||||
)
|
||||
@@ -356,120 +341,397 @@ class Worker:
|
||||
|
||||
|
||||
class CrawlWorker(Worker):
|
||||
"""Worker for processing Crawl objects."""
|
||||
"""
|
||||
Worker for processing Crawl objects.
|
||||
|
||||
Responsibilities:
|
||||
1. Run on_Crawl__* hooks (e.g., chrome launcher)
|
||||
2. Create Snapshots from URLs
|
||||
3. Spawn SnapshotWorkers (up to MAX_SNAPSHOT_WORKERS)
|
||||
4. Monitor snapshots and seal crawl when all done
|
||||
"""
|
||||
|
||||
name: ClassVar[str] = 'crawl'
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
MAX_SNAPSHOT_WORKERS: ClassVar[int] = 8 # Per crawl limit
|
||||
|
||||
def __init__(self, crawl_id: str, **kwargs: Any):
|
||||
super().__init__(**kwargs)
|
||||
self.crawl_id = crawl_id
|
||||
self.crawl = None
|
||||
|
||||
def get_model(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
return Crawl
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
|
||||
qs = super().get_queue()
|
||||
if self.crawl_id:
|
||||
qs = qs.filter(id=self.crawl_id)
|
||||
return qs
|
||||
def on_startup(self) -> None:
|
||||
"""Load crawl."""
|
||||
super().on_startup()
|
||||
|
||||
from archivebox.crawls.models import Crawl
|
||||
self.crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Run crawl state machine, spawn SnapshotWorkers."""
|
||||
import sys
|
||||
self.on_startup()
|
||||
|
||||
try:
|
||||
print(f'[cyan]🔄 CrawlWorker.runloop: Starting tick() for crawl {self.crawl_id}[/cyan]', file=sys.stderr)
|
||||
# Advance state machine: QUEUED → STARTED (triggers run() via @started.enter)
|
||||
self.crawl.sm.tick()
|
||||
self.crawl.refresh_from_db()
|
||||
print(f'[cyan]🔄 tick() complete, crawl status={self.crawl.status}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Now spawn SnapshotWorkers and monitor progress
|
||||
while True:
|
||||
# Check if crawl is done
|
||||
if self._is_crawl_finished():
|
||||
print(f'[cyan]🔄 Crawl finished, sealing...[/cyan]', file=sys.stderr)
|
||||
self.crawl.sm.seal()
|
||||
break
|
||||
|
||||
# Spawn workers for queued snapshots
|
||||
self._spawn_snapshot_workers()
|
||||
|
||||
time.sleep(2) # Check every 2s
|
||||
|
||||
finally:
|
||||
self.on_shutdown()
|
||||
|
||||
def _spawn_snapshot_workers(self) -> None:
|
||||
"""Spawn SnapshotWorkers for queued snapshots (up to limit)."""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Count running SnapshotWorkers for this crawl
|
||||
running_count = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type='snapshot',
|
||||
parent_id=self.db_process.id, # Children of this CrawlWorker
|
||||
status__in=['running', 'started'],
|
||||
).count()
|
||||
|
||||
if running_count >= self.MAX_SNAPSHOT_WORKERS:
|
||||
return # At limit
|
||||
|
||||
# Get queued snapshots for this crawl (SnapshotWorker will mark as STARTED in on_startup)
|
||||
queued_snapshots = Snapshot.objects.filter(
|
||||
crawl_id=self.crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
).order_by('created_at')[:self.MAX_SNAPSHOT_WORKERS - running_count]
|
||||
|
||||
import sys
|
||||
print(f'[yellow]🔧 _spawn_snapshot_workers: running={running_count}/{self.MAX_SNAPSHOT_WORKERS}, queued={queued_snapshots.count()}[/yellow]', file=sys.stderr)
|
||||
|
||||
# Spawn workers
|
||||
for snapshot in queued_snapshots:
|
||||
print(f'[yellow]🔧 Spawning worker for {snapshot.url} (status={snapshot.status})[/yellow]', file=sys.stderr)
|
||||
SnapshotWorker.start(snapshot_id=str(snapshot.id))
|
||||
log_worker_event(
|
||||
worker_type='CrawlWorker',
|
||||
event=f'Spawned SnapshotWorker for {snapshot.url}',
|
||||
indent_level=1,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
def _is_crawl_finished(self) -> bool:
|
||||
"""Check if all snapshots are sealed."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
pending = Snapshot.objects.filter(
|
||||
crawl_id=self.crawl_id,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
|
||||
).count()
|
||||
|
||||
return pending == 0
|
||||
|
||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||
"""
|
||||
Terminate all background Crawl hooks when crawl finishes.
|
||||
|
||||
Background hooks (e.g., chrome launcher) should only be killed when:
|
||||
- All snapshots are done (crawl is sealed)
|
||||
- Worker is shutting down
|
||||
"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Query for all running hook processes that are children of this CrawlWorker
|
||||
background_hooks = Process.objects.filter(
|
||||
parent_id=self.db_process.id,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
).select_related('machine')
|
||||
|
||||
# Build dict for shared termination logic
|
||||
background_processes = {
|
||||
hook.cmd[0] if hook.cmd else f'hook-{hook.pid}': hook
|
||||
for hook in background_hooks
|
||||
}
|
||||
|
||||
# Use shared termination logic from Worker base class
|
||||
self._terminate_background_hooks(
|
||||
background_processes=background_processes,
|
||||
worker_type='CrawlWorker',
|
||||
indent_level=1,
|
||||
)
|
||||
|
||||
super().on_shutdown(error)
|
||||
|
||||
|
||||
class SnapshotWorker(Worker):
|
||||
"""Worker for processing Snapshot objects."""
|
||||
"""
|
||||
Worker that owns sequential hook execution for ONE snapshot.
|
||||
|
||||
Unlike other workers, SnapshotWorker doesn't poll a queue - it's given
|
||||
a specific snapshot_id and runs all hooks for that snapshot sequentially.
|
||||
|
||||
Execution flow:
|
||||
1. Mark snapshot as STARTED
|
||||
2. Discover hooks for snapshot
|
||||
3. For each hook (sorted by name):
|
||||
a. Fork hook Process
|
||||
b. If foreground: wait for completion
|
||||
c. If background: track but continue to next hook
|
||||
d. Update ArchiveResult status
|
||||
e. Advance current_step when all step's hooks complete
|
||||
4. When all hooks done: seal snapshot
|
||||
5. On shutdown: SIGTERM all background hooks
|
||||
"""
|
||||
|
||||
name: ClassVar[str] = 'snapshot'
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
|
||||
def __init__(self, snapshot_id: str, **kwargs: Any):
|
||||
super().__init__(**kwargs)
|
||||
self.snapshot_id = snapshot_id
|
||||
self.snapshot = None
|
||||
self.background_processes: dict[str, Any] = {} # hook_name -> Process
|
||||
|
||||
def get_model(self):
|
||||
"""Not used - SnapshotWorker doesn't poll queues."""
|
||||
from archivebox.core.models import Snapshot
|
||||
return Snapshot
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""Get queue of Snapshots ready for processing, optionally filtered by crawl_id."""
|
||||
qs = super().get_queue()
|
||||
if self.crawl_id:
|
||||
qs = qs.filter(crawl_id=self.crawl_id)
|
||||
return qs
|
||||
def on_startup(self) -> None:
|
||||
"""Load snapshot and mark as STARTED."""
|
||||
super().on_startup()
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
self.snapshot = Snapshot.objects.get(id=self.snapshot_id)
|
||||
|
||||
class ArchiveResultWorker(Worker):
|
||||
"""Worker for processing ArchiveResult objects."""
|
||||
# Mark snapshot as STARTED
|
||||
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
self.snapshot.retry_at = None # No more polling needed
|
||||
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
name: ClassVar[str] = 'archiveresult'
|
||||
MAX_TICK_TIME: ClassVar[int] = 120
|
||||
|
||||
def __init__(self, plugin: str | None = None, **kwargs: Any):
|
||||
super().__init__(**kwargs)
|
||||
self.plugin = plugin
|
||||
|
||||
def get_model(self):
|
||||
def runloop(self) -> None:
|
||||
"""Execute all hooks sequentially."""
|
||||
from archivebox.hooks import discover_hooks, is_background_hook, extract_step
|
||||
from archivebox.core.models import ArchiveResult
|
||||
return ArchiveResult
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""
|
||||
Get queue of ArchiveResults ready for processing.
|
||||
self.on_startup()
|
||||
|
||||
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
|
||||
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
|
||||
"""
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.hooks import extract_step
|
||||
|
||||
qs = super().get_queue()
|
||||
|
||||
if self.crawl_id:
|
||||
qs = qs.filter(snapshot__crawl_id=self.crawl_id)
|
||||
|
||||
if self.plugin:
|
||||
qs = qs.filter(plugin=self.plugin)
|
||||
|
||||
# Step-based filtering: only process ARs whose step <= snapshot.current_step
|
||||
# Since step is derived from hook_name, we filter in Python after initial query
|
||||
# This is efficient because the base query already filters by retry_at and status
|
||||
|
||||
# Get candidate ARs
|
||||
candidates = list(qs[:50]) # Limit to avoid loading too many
|
||||
ready_pks = []
|
||||
|
||||
for ar in candidates:
|
||||
if not ar.hook_name:
|
||||
# Legacy ARs without hook_name - process them
|
||||
ready_pks.append(ar.pk)
|
||||
continue
|
||||
|
||||
ar_step = extract_step(ar.hook_name)
|
||||
snapshot_step = ar.snapshot.current_step
|
||||
|
||||
if ar_step <= snapshot_step:
|
||||
ready_pks.append(ar.pk)
|
||||
|
||||
# Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
|
||||
return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""Process an ArchiveResult by running its plugin."""
|
||||
try:
|
||||
obj.sm.tick()
|
||||
return True
|
||||
# Discover all hooks for this snapshot
|
||||
hooks = discover_hooks('Snapshot', config=self.snapshot.config)
|
||||
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
|
||||
|
||||
# Execute each hook sequentially
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin = self._extract_plugin_name(hook_name)
|
||||
hook_step = extract_step(hook_name)
|
||||
is_background = is_background_hook(hook_name)
|
||||
|
||||
# Create ArchiveResult for THIS HOOK (not per plugin)
|
||||
# One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
|
||||
# Unique key = (snapshot, plugin, hook_name) for idempotency
|
||||
ar, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=self.snapshot,
|
||||
plugin=plugin,
|
||||
hook_name=hook_name,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.STARTED,
|
||||
'start_ts': timezone.now(),
|
||||
}
|
||||
)
|
||||
|
||||
if not created:
|
||||
# Update existing AR to STARTED
|
||||
ar.status = ArchiveResult.StatusChoices.STARTED
|
||||
ar.start_ts = timezone.now()
|
||||
ar.save(update_fields=['status', 'start_ts', 'modified_at'])
|
||||
|
||||
# Fork and run the hook
|
||||
process = self._run_hook(hook_path, ar)
|
||||
|
||||
if is_background:
|
||||
# Track but don't wait
|
||||
self.background_processes[hook_name] = process
|
||||
log_worker_event(
|
||||
worker_type='SnapshotWorker',
|
||||
event=f'Started background hook: {hook_name} (timeout={process.timeout}s)',
|
||||
indent_level=2,
|
||||
pid=self.pid,
|
||||
)
|
||||
else:
|
||||
# Wait for foreground hook to complete
|
||||
self._wait_for_hook(process, ar)
|
||||
log_worker_event(
|
||||
worker_type='SnapshotWorker',
|
||||
event=f'Completed hook: {hook_name}',
|
||||
indent_level=2,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
# Check if we can advance to next step
|
||||
self._try_advance_step()
|
||||
|
||||
# All hooks launched (or completed) - cleanup and seal
|
||||
self._cleanup_empty_archiveresults()
|
||||
self.snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
self.snapshot.save(update_fields=['status', 'modified_at'])
|
||||
|
||||
except Exception as e:
|
||||
# Error will be logged in runloop's completion event
|
||||
traceback.print_exc()
|
||||
return False
|
||||
# Mark snapshot as failed
|
||||
self.snapshot.status = Snapshot.StatusChoices.SEALED # Still seal on error
|
||||
self.snapshot.save(update_fields=['status', 'modified_at'])
|
||||
raise
|
||||
finally:
|
||||
self.on_shutdown()
|
||||
|
||||
def _run_hook(self, hook_path: Path, ar: Any) -> Any:
|
||||
"""Fork and run a hook using Process model, return Process."""
|
||||
from archivebox.hooks import run_hook
|
||||
|
||||
# Create output directory
|
||||
output_dir = ar.create_output_dir()
|
||||
|
||||
# Run hook using Process.launch() - returns Process model directly
|
||||
# Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy
|
||||
process = run_hook(
|
||||
script=hook_path,
|
||||
output_dir=output_dir,
|
||||
config=self.snapshot.config,
|
||||
timeout=120,
|
||||
parent=self.db_process,
|
||||
url=str(self.snapshot.url),
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
)
|
||||
|
||||
# Link ArchiveResult to Process for tracking
|
||||
ar.process = process
|
||||
ar.save(update_fields=['process_id', 'modified_at'])
|
||||
|
||||
return process
|
||||
|
||||
def _wait_for_hook(self, process: Any, ar: Any) -> None:
|
||||
"""Wait for hook using Process.wait(), update AR status."""
|
||||
# Use Process.wait() helper instead of manual polling
|
||||
try:
|
||||
exit_code = process.wait(timeout=process.timeout)
|
||||
except TimeoutError:
|
||||
# Hook exceeded timeout - kill it
|
||||
process.kill(signal_num=9)
|
||||
exit_code = -1
|
||||
|
||||
# Update ArchiveResult from hook output
|
||||
ar.update_from_output()
|
||||
ar.end_ts = timezone.now()
|
||||
|
||||
# Determine final status from hook exit code
|
||||
if exit_code == 0:
|
||||
ar.status = ar.StatusChoices.SUCCEEDED
|
||||
else:
|
||||
ar.status = ar.StatusChoices.FAILED
|
||||
|
||||
ar.save(update_fields=['status', 'end_ts', 'modified_at'])
|
||||
|
||||
def _try_advance_step(self) -> None:
|
||||
"""Advance current_step if all foreground hooks in current step are done."""
|
||||
from django.db.models import Q
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
current_step = self.snapshot.current_step
|
||||
|
||||
# Single query: foreground hooks in current step that aren't finished
|
||||
# Foreground hooks: hook_name doesn't contain '.bg.'
|
||||
pending_foreground = self.snapshot.archiveresult_set.filter(
|
||||
Q(hook_name__contains=f'__{current_step}_') & # Current step
|
||||
~Q(hook_name__contains='.bg.') & # Not background
|
||||
~Q(status__in=ArchiveResult.FINAL_STATES) # Not finished
|
||||
).exists()
|
||||
|
||||
if pending_foreground:
|
||||
return # Still waiting for hooks
|
||||
|
||||
# All foreground hooks done - advance!
|
||||
self.snapshot.current_step += 1
|
||||
self.snapshot.save(update_fields=['current_step', 'modified_at'])
|
||||
|
||||
log_worker_event(
|
||||
worker_type='SnapshotWorker',
|
||||
event=f'Advanced to step {self.snapshot.current_step}',
|
||||
indent_level=2,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
def _cleanup_empty_archiveresults(self) -> None:
|
||||
"""Delete ArchiveResults that produced no output files."""
|
||||
empty_ars = self.snapshot.archiveresult_set.filter(
|
||||
output_files={} # No output files
|
||||
).filter(
|
||||
status__in=self.snapshot.archiveresult_set.model.FINAL_STATES # Only delete finished ones
|
||||
)
|
||||
|
||||
deleted_count = empty_ars.count()
|
||||
if deleted_count > 0:
|
||||
empty_ars.delete()
|
||||
log_worker_event(
|
||||
worker_type='SnapshotWorker',
|
||||
event=f'Deleted {deleted_count} empty ArchiveResults',
|
||||
indent_level=2,
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||
"""
|
||||
Terminate all background Snapshot hooks when snapshot finishes.
|
||||
|
||||
Background hooks should only be killed when:
|
||||
- All foreground hooks are done (snapshot is sealed)
|
||||
- Worker is shutting down
|
||||
"""
|
||||
# Use shared termination logic from Worker base class
|
||||
self._terminate_background_hooks(
|
||||
background_processes=self.background_processes,
|
||||
worker_type='SnapshotWorker',
|
||||
indent_level=2,
|
||||
)
|
||||
|
||||
super().on_shutdown(error)
|
||||
|
||||
@staticmethod
|
||||
def _extract_plugin_name(hook_name: str) -> str:
|
||||
"""Extract plugin name from hook filename."""
|
||||
# on_Snapshot__50_wget.py -> wget
|
||||
name = hook_name.split('__')[-1] # Get part after last __
|
||||
name = name.replace('.py', '').replace('.js', '').replace('.sh', '')
|
||||
name = name.replace('.bg', '') # Remove .bg suffix
|
||||
return name
|
||||
|
||||
@classmethod
|
||||
def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
|
||||
"""Fork a new worker as subprocess with optional plugin filter."""
|
||||
def start(cls, snapshot_id: str, **kwargs: Any) -> int:
|
||||
"""Fork a SnapshotWorker for a specific snapshot."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
if worker_id is None:
|
||||
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||
|
||||
# Use module-level function for pickling compatibility
|
||||
proc = MPProcess(
|
||||
target=_run_worker,
|
||||
args=(cls.name, worker_id, daemon),
|
||||
kwargs={'plugin': plugin, **kwargs},
|
||||
name=f'{cls.name}_worker_{worker_id}',
|
||||
target=_run_snapshot_worker, # New module-level function
|
||||
args=(snapshot_id, worker_id),
|
||||
kwargs=kwargs,
|
||||
name=f'snapshot_worker_{snapshot_id[:8]}',
|
||||
)
|
||||
proc.start()
|
||||
|
||||
@@ -481,7 +743,6 @@ class ArchiveResultWorker(Worker):
|
||||
WORKER_TYPES.update({
|
||||
'crawl': CrawlWorker,
|
||||
'snapshot': SnapshotWorker,
|
||||
'archiveresult': ArchiveResultWorker,
|
||||
})
|
||||
|
||||
|
||||
|
||||
333
old/TODO_Process_cleanup_unification.md
Normal file
333
old/TODO_Process_cleanup_unification.md
Normal file
@@ -0,0 +1,333 @@
|
||||
# Process Model Integration Plan
|
||||
|
||||
## Current Architecture
|
||||
|
||||
### Hook Execution Flow
|
||||
```
|
||||
Orchestrator
|
||||
├─> CrawlWorker
|
||||
│ └─> Crawl.run() [state machine @started.enter]
|
||||
│ └─> run_hook() for on_Crawl__* hooks
|
||||
│ └─> subprocess.Popen (NOT using Process model)
|
||||
│
|
||||
└─> SnapshotWorker
|
||||
└─> Snapshot.run() [planned - doesn't exist yet]
|
||||
└─> ArchiveResult.run() [state machine @started.enter]
|
||||
└─> run_hook() for on_Snapshot__* hooks
|
||||
└─> subprocess.Popen (NOT using Process model)
|
||||
```
|
||||
|
||||
### Problem
|
||||
1. **No Process tracking**: `run_hook()` uses `subprocess.Popen` directly, never creates Process records
|
||||
2. **Orphaned Process model**: Process model has `.launch()`, `.wait()`, `.terminate()` methods that are NEVER used
|
||||
3. **Manual process management**: SnapshotWorker manually uses psutil for waiting/killing
|
||||
4. **Duplicate logic**: Process model and run_hook() both do subprocess management independently
|
||||
|
||||
## Unified Architecture
|
||||
|
||||
### Goal
|
||||
Make Process model the **single source of truth** for all subprocess operations:
|
||||
- Hook execution
|
||||
- PID tracking
|
||||
- stdout/stderr capture
|
||||
- Process lifecycle (launch, wait, terminate)
|
||||
|
||||
### Design
|
||||
|
||||
```python
# hooks.py - Thin wrapper
def run_hook(...) -> Process:
    """
    Run a hook using Process model (THIN WRAPPER).

    Returns Process model instance for tracking and control.
    """
    from archivebox.machine.models import Process

    # Build command
    cmd = build_hook_cmd(script, kwargs)

    # Use Process.launch() - handles everything
    process = Process.objects.create(
        machine=Machine.current(),
        process_type=Process.TypeChoices.HOOK,
        pwd=str(output_dir),
        cmd=cmd,
        env=build_hook_env(config),
        timeout=timeout,
    )

    # Launch subprocess
    process.launch(background=is_background_hook(script.name))

    return process  # Return Process, not dict


# worker.py - Use Process methods
class SnapshotWorker:
    def _run_hook(self, hook_path, ar) -> Process:
        """Fork hook using Process model."""
        process = run_hook(
            hook_path,
            ar.create_output_dir(),
            self.snapshot.config,
            url=self.snapshot.url,
            snapshot_id=str(self.snapshot.id),
        )

        # Link ArchiveResult to Process
        ar.process = process
        ar.save()

        return process

    def _wait_for_hook(self, process, ar):
        """Wait using Process.wait() method."""
        exit_code = process.wait(timeout=None)

        # Update AR from hook output
        ar.update_from_output()
        ar.status = ar.StatusChoices.SUCCEEDED if exit_code == 0 else ar.StatusChoices.FAILED
        ar.save()

    def on_shutdown(self):
        """
        Terminate all background hooks in parallel with per-plugin timeouts.

        Phase 1: Send SIGTERM to all in parallel (polite request to wrap up)
        Phase 2: Wait for all in parallel, respecting individual plugin timeouts
        Phase 3: SIGKILL any that exceed their timeout

        Each plugin has its own timeout (SCREENSHOT_TIMEOUT=60, YTDLP_TIMEOUT=300, etc.)
        Some hooks (consolelog, responses) exit immediately on SIGTERM.
        Others (ytdlp, wget) need their full timeout to finish actual work.
        """
        # Send SIGTERM to all processes in parallel
        for hook_name, process in self.background_processes.items():
            os.kill(process.pid, signal.SIGTERM)

        # Build per-process deadlines based on plugin-specific timeouts
        deadlines = {
            name: (proc, time.time() + max(0, proc.timeout - (time.time() - proc.started_at.timestamp())))
            for name, proc in self.background_processes.items()
        }

        # Poll all processes in parallel - no head-of-line blocking
        still_running = set(deadlines.keys())
        while still_running:
            time.sleep(0.1)
            for name in list(still_running):
                proc, deadline = deadlines[name]
                if not proc.is_running():
                    still_running.remove(name)
                elif time.time() >= deadline:
                    os.kill(proc.pid, signal.SIGKILL)  # Timeout exceeded
                    still_running.remove(name)


# models.py - Process becomes active
class Process:
    def launch(self, background=False):
        """Spawn subprocess and track it."""
        with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err:
            proc = subprocess.Popen(
                self.cmd,
                cwd=self.pwd,
                stdout=out,
                stderr=err,
                env=self._build_env(),
            )

        self.pid = proc.pid
        self.started_at = timezone.now()
        self.status = self.StatusChoices.RUNNING
        self.save()

        if not background:
            # Foreground - wait inline
            proc.wait()
            self.exit_code = proc.returncode
            self.ended_at = timezone.now()
            self.status = self.StatusChoices.EXITED
            self.save()

        return self

    def wait(self, timeout=None):
        """Wait for process to exit, polling DB."""
        while True:
            self.refresh_from_db()
            if self.status == self.StatusChoices.EXITED:
                return self.exit_code

            # Check via psutil if Process died without updating DB
            if not self.is_running():
                self._reap()  # Update status from OS
                return self.exit_code

            time.sleep(0.1)

    def terminate(self, sig=signal.SIGTERM):
        """Gracefully terminate: SIGTERM → wait → SIGKILL."""
        if not self.is_running():
            return True

        os.kill(self.pid, sig)

        # Wait for graceful shutdown
        for _ in range(50):  # 5 seconds
            if not self.is_running():
                self._reap()
                return True
            time.sleep(0.1)

        # Escalate to SIGKILL
        os.kill(self.pid, signal.SIGKILL)
        self._reap()
        return True
```

## Migration Steps

### Step 1: Update Process.launch() (DONE - already exists)
Process model already has `.launch()`, `.wait()`, `.terminate()` methods implemented in machine/models.py:1295-1593

### Step 2: Refactor run_hook() to use Process.launch()
**File**: `archivebox/hooks.py`

Change signature from:
```python
def run_hook(...) -> HookResult:  # Returns dict
```

To:
```python
def run_hook(...) -> Process:  # Returns Process model
```

**Implementation**:
```python
def run_hook(script, output_dir, config, timeout=None, **kwargs) -> Process:
    from archivebox.machine.models import Process, Machine

    # Build command
    cmd = build_hook_cmd(script, kwargs)
    env = build_hook_env(config)
    is_bg = is_background_hook(script.name)

    # Create Process record
    process = Process.objects.create(
        machine=Machine.current(),
        process_type=Process.TypeChoices.HOOK,
        pwd=str(output_dir),
        cmd=cmd,
        env=env,
        timeout=timeout or 120,
    )

    # Launch subprocess
    process.launch(background=is_bg)

    return process
```

### Step 3: Update SnapshotWorker to use Process methods
**File**: `archivebox/workers/worker.py`

Replace manual psutil code with Process model methods (shown above in Design section).

### Step 4: Update ArchiveResult.run() to use new run_hook()
**File**: `archivebox/core/models.py:2559`

Change from:
```python
result = run_hook(...)  # Returns HookResult dict
if result is None:
    is_bg_hook = True
```

To:
```python
process = run_hook(...)  # Returns Process
self.process = process
self.save()

if process.status == Process.StatusChoices.RUNNING:
    # Background hook - still running
    return
else:
    # Foreground hook - completed
    self.update_from_output()
```

### Step 5: Update Crawl.run() similarly
**File**: `archivebox/crawls/models.py:374`

Same pattern as ArchiveResult.run()

## Benefits

### 1. Single Source of Truth
- Process model owns ALL subprocess operations
- No duplicate logic between run_hook(), Process, and workers
- Consistent PID tracking, stdout/stderr handling

### 2. Proper Hierarchy
```
Process.parent_id creates tree:
Orchestrator (PID 1000)
└─> CrawlWorker (PID 1001, parent=1000)
      └─> on_Crawl__01_chrome.js (PID 1010, parent=1001)
└─> SnapshotWorker (PID 1020, parent=1000)
      └─> on_Snapshot__50_wget.py (PID 1021, parent=1020)
      └─> on_Snapshot__63_ytdlp.bg.py (PID 1022, parent=1020)
```

### 3. Better Observability
- Query all hook processes: `snapshot.process_set.all()`
- Count running: `Process.objects.filter(status='running').count()`
- Track resource usage via Process.get_memory_info()

### 4. Cleaner Code
- SnapshotWorker._wait_for_hook: 25 lines → 8 lines
- SnapshotWorker.on_shutdown: 12 lines → 7 lines
- run_hook(): ~200 lines → ~50 lines
- Total: ~100 LoC saved

## Risks & Mitigation

### Risk 1: Breaking existing run_hook() callers
**Mitigation**: Three-phase rollout
1. Phase 1: Add run_hook_v2() that returns Process
2. Phase 2: Migrate callers to run_hook_v2()
3. Phase 3: Rename run_hook → run_hook_legacy, run_hook_v2 → run_hook

### Risk 2: Background hook tracking changes
**Mitigation**:
- Process.launch(background=True) handles async launches
- Process.wait() already polls for completion
- Behavior identical to current subprocess.Popen

### Risk 3: Performance overhead (extra DB writes)
**Mitigation**:
- Process records already being created (just not used)
- Batch updates where possible
- Monitor via metrics

## Timeline

### Immediate (This PR)
- [x] State machine fixes (completed)
- [x] Step advancement optimization (completed)
- [x] Document unified architecture (this file)

### Next PR (Process Integration)
1. Add run_hook_v2() returning Process
2. Update SnapshotWorker to use Process methods
3. Migrate ArchiveResult.run() and Crawl.run()
4. Deprecate old run_hook()

### Future
- Remove run_hook_legacy after migration complete
- Add Process.get_tree() for hierarchy visualization
- Add ProcessMachine state machine for lifecycle management
Reference in New Issue
Block a user