unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -1,3 +0,0 @@
[SERVER_CONFIG]
SECRET_KEY = y6fw9wcaqls9sx_dze6ahky9ggpkpzoaw5g5v98_u3ro5j0_4f

View File

@@ -35,12 +35,8 @@ class WorkerSchema(Schema):
model: str
max_tick_time: int
max_concurrent_tasks: int
poll_interval: float
idle_timeout: int
running_count: int
running_workers: List[dict[str, Any]]
queue_count: int
queue: List[QueueItemSchema]
@staticmethod
def resolve_model(obj) -> str:
@@ -55,38 +51,21 @@ class WorkerSchema(Schema):
def resolve_max_concurrent_tasks(obj) -> int:
return obj.MAX_CONCURRENT_TASKS
@staticmethod
def resolve_poll_interval(obj) -> float:
return obj.POLL_INTERVAL
@staticmethod
def resolve_idle_timeout(obj) -> int:
return obj.IDLE_TIMEOUT
@staticmethod
def resolve_running_count(obj) -> int:
return len(obj.get_running_workers())
return obj.get_worker_count()
@staticmethod
def resolve_running_workers(obj) -> List[dict[str, Any]]:
return obj.get_running_workers()
@staticmethod
def resolve_queue_count(obj) -> int:
return obj.get_queue().count()
@staticmethod
def resolve_queue(obj) -> List[QueueItemSchema]:
return list(obj.get_queue()[:50]) # Limit to 50 items
class OrchestratorSchema(Schema):
"""Schema for the Orchestrator."""
is_running: bool
poll_interval: float
idle_timeout: int
max_workers_per_type: int
max_total_workers: int
max_crawl_workers: int
total_worker_count: int
workers: List[WorkerSchema]
@@ -95,23 +74,20 @@ class OrchestratorSchema(Schema):
def get_orchestrator(request):
"""Get the orchestrator status and all worker queues."""
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
from archivebox.workers.worker import CrawlWorker
orchestrator = Orchestrator()
# Create temporary worker instances to query their queues
workers = [
CrawlWorker(worker_id=-1),
SnapshotWorker(worker_id=-1),
ArchiveResultWorker(worker_id=-1),
]
return {
'is_running': orchestrator.is_running(),
'poll_interval': orchestrator.POLL_INTERVAL,
'idle_timeout': orchestrator.IDLE_TIMEOUT,
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
'total_worker_count': orchestrator.get_total_worker_count(),
'workers': workers,
}
@@ -120,41 +96,12 @@ def get_orchestrator(request):
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
"""List all worker types and their current status."""
from archivebox.workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
from archivebox.workers.worker import CrawlWorker
# Create temporary instances to query their queues
return [
CrawlWorker(worker_id=-1),
SnapshotWorker(worker_id=-1),
ArchiveResultWorker(worker_id=-1),
]
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
def get_worker(request, worker_name: str):
"""Get status and queue for a specific worker type."""
from archivebox.workers.worker import WORKER_TYPES
if worker_name not in WORKER_TYPES:
from ninja.errors import HttpError
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
WorkerClass = WORKER_TYPES[worker_name]
return WorkerClass(worker_id=-1)
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
def get_worker_queue(request, worker_name: str, limit: int = 100):
"""Get the current queue for a specific worker type."""
from archivebox.workers.worker import WORKER_TYPES
if worker_name not in WORKER_TYPES:
from ninja.errors import HttpError
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
WorkerClass = WORKER_TYPES[worker_name]
worker = WorkerClass(worker_id=-1)
return list(worker.get_queue()[:limit])
# Progress endpoint moved to core.views.live_progress_view for simplicity

View File

@@ -96,10 +96,9 @@ def add(urls: str | list[str],
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
# 3. The CrawlMachine will create the root Snapshot when started
# If URLs are from a file: first URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover more URLs
# Those URLs become child Snapshots (depth=1)
# 3. The CrawlMachine will create Snapshots from all URLs when started
# Parser extractors run on snapshots and discover more URLs
# Discovered URLs become child Snapshots (depth+1)
if index_only:
# Just create the crawl but don't start processing
@@ -119,10 +118,9 @@ def add(urls: str | list[str],
# 5. Start the orchestrator to process the queue
# The orchestrator will:
# - Process Crawl -> create root Snapshot
# - Process root Snapshot -> run parser extractors -> discover URLs
# - Create child Snapshots from discovered URLs
# - Process child Snapshots -> run extractors
# - Process Crawl -> create Snapshots from all URLs
# - Process Snapshots -> run extractors
# - Parser extractors discover new URLs -> create child Snapshots
# - Repeat until max_depth reached
if bg:

View File

@@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
from archivebox.config.common import STORAGE_CONFIG
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
if install:
from archivebox.cli.archivebox_install import install as install_method

View File

@@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐
│ │ (initial) │ │
│ └──────┬──────┘ │
│ │ │ tick() unless can_start()
│ tick() when
│ │ can_start()
│ ▼
│ ┌─────────────┐
│ │ STARTED │─────────────────┘
│◄────────────────┐
│ enter: │ │ tick() unless is_finished()
│ result.run()│─────────────────┘
│ (execute │
│ hook via
│ run_hook())│
────────────┘
│ tick() checks status set by hook output
├────────────────┬────────────────┬────────────────┐
▼ ▼
┌───────────┐ ┌───────────┐ ┌─────────── ┌───────────┐
│ │ SUCCEEDED FAILED SKIPPED BACKOFF
│ (final) (final) (final)
└───────────┘ └───────────┘ └───────────┘ └─────┬─────┘
can_start()───┘
loops back to STARTED
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──┬───────┬──┘ │ │
│ │ tick() unless can_start() │
│ │ exceeded_max_ │ │
attempts │ │
┌──────────┐
SKIPPED │ │
│ │ (final)
│ └──────────┘ │
│ tick() when │
│ can_start()
┌─────────────┐
│ STARTED │──────────────────┘ │
│ │ │◄─────────────────────────────────────────────────┐
│ enter: │ │
│ result.run()│ tick() unless
│ (execute │ is_finished()
│ hook via │──────────────────────
│ │ run_hook())
└──────┬──────┘ │ │
│ │
│ tick() checks status set by hook output │ │
├─────────────┬─────────────┬─────────────┐
▼ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
│ │ │ │ │
│ exceeded_max_ │ │ can_start()│ │
│ attempts │ │ loops back │ │
│ ▼ │ └────────────┘ │
│ ┌──────────┐ │ │
│ │ SKIPPED │◄─┘ │
│ │ (final) │ │
│ └──────────┘ │
│ │
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
└─────────────────────────────────────────────────────────────────────────────┘
@@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start()
│ │ │ tick() unless can_install()
│ │ │ (stays queued if failed) │
│ │ tick() when │ │
│ │ can_start() │ │
│ │
┌─────────────┐ │ │
│ STARTED │─────────────────┘
│◄────────────────┐
│ enter: │ │
│ binary.run()│ │ tick() unless is_finished()
│ (discover │─────────────────┘
Binary
hooks,
│ try each │
│ provider) │ │
└──────┬──────┘
│ │ can_install() │ │
│ │
│ on_install() runs │ │
│ during transition: │
• binary.run() │
(discover Binary │ │
│ hooks, try each │
│ provider until │
│ │ one succeeds)
│ │ • Sets abspath,
version, sha256
│ If install fails:
│ │ raises exception──────┘ │
│ │ (retry_at bumped) │
│ │ │
│ tick() checks status set by hook output
├────────────────────────────────┐
┌─────────────┐ ┌─────────────┐
│ │ SUCCEEDED FAILED │
│ │ (final) (final)
│ │
│ │ abspath,│ no provider │
│ version set │ │ succeeded
│ └─────────────┘ └─────────────┘ │
┌─────────────┐
INSTALLED │
│ (final)
│ │
│ │ Binary is
│ │ ready to
│ │ use
└─────────────┘
│ │
│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter)
│ Hooks triggered: on_Binary__* (provider hooks during transition)
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
│ Installation is synchronous - no intermediate STARTED state │
└─────────────────────────────────────────────────────────────────────────────┘
"""

View File

@@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
if not archive_dir.exists():
return stats
print('[*] Scanning for old directories in archive/...')
print('[DEBUG Phase1] Scanning for old directories in archive/...')
# Scan for real directories only (skip symlinks - they're already migrated)
all_entries = list(os.scandir(archive_dir))
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
entries = [
(e.stat().st_mtime, e.path)
for e in os.scandir(archive_dir)
for e in all_entries
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
print(f'[*] Found {len(entries)} old directories to drain')
for mtime, entry_path in entries:
@@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
# Ensure snapshot has a valid crawl (migration 0024 may have failed)
from archivebox.crawls.models import Crawl
has_valid_crawl = False
if snapshot.crawl_id:
# Check if the crawl actually exists
has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists()
if not has_valid_crawl:
# Create a new crawl (created_by will default to system user)
crawl = Crawl.objects.create(urls=snapshot.url)
# Use queryset update to avoid triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
# Refresh the instance
snapshot.crawl = crawl
snapshot.crawl_id = crawl.id
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
# Check if needs migration (0.8.x → 0.9.x)
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
if snapshot.fs_migration_needed:
try:
# Manually trigger filesystem migration without full save()
# This avoids UNIQUE constraint issues while still migrating files
cleanup_info = None
if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
# because snapshot.timestamp might be truncated
old_dir = entry_path
new_dir = snapshot.get_storage_path_for_version('0.9.0')
print(f"[DEBUG Phase1] Migrating {old_dir.name}{new_dir}")
# Manually migrate files
if not new_dir.exists() and old_dir.exists():
new_dir.mkdir(parents=True, exist_ok=True)
import shutil
file_count = 0
for old_file in old_dir.rglob('*'):
if old_file.is_file():
rel_path = old_file.relative_to(old_dir)
new_file = new_dir / rel_path
if not new_file.exists():
new_file.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(old_file, new_file)
file_count += 1
print(f"[DEBUG Phase1] Copied {file_count} files")
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
@@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
# Commit the transaction
transaction.commit()
# Manually call cleanup since we bypassed normal save() flow
if cleanup_info:
old_dir, new_dir = cleanup_info
# Cleanup: delete old dir and create symlink
if old_dir.exists() and old_dir != new_dir:
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
@@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
continue
try:
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
# Check if snapshot has a directory on disk
from pathlib import Path
output_dir = Path(snapshot.output_dir)
has_directory = output_dir.exists() and output_dir.is_dir()
# Only reconcile if directory exists (don't create empty directories for orphans)
if has_directory:
snapshot.reconcile_with_index_json()
# Clean up invalid field values from old migrations
if not isinstance(snapshot.current_step, int):
snapshot.current_step = 0
# If still needs migration, it's an orphan (no directory on disk)
# Mark it as migrated to prevent save() from triggering filesystem migration
if snapshot.fs_migration_needed:
if has_directory:
print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
else:
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
# Use queryset update to set fs_version without triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
snapshot.fs_version = '0.9.0'
# Queue for archiving (state machine will handle it)
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()
stats['reconciled'] += 1
stats['reconciled'] += 1 if has_directory else 0
stats['queued'] += 1
except Exception as e:
# Skip snapshots that can't be processed (e.g., missing crawl)

View File

@@ -4,6 +4,7 @@
from django.db import migrations, connection
import json
from pathlib import Path
from archivebox.uuid_compat import uuid7
def parse_cmd_field(cmd_raw):
@@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw):
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
import uuid
import socket
from datetime import datetime
@@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor):
return row[0]
# Create new machine
machine_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
machine_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
Returns:
binary_id (str)
"""
import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
@@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
return row[0]
# Create new binary
binary_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
binary_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
Returns:
process_id (str)
"""
import uuid
from datetime import datetime
process_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
process_id = uuid7().hex
now = datetime.now().isoformat()
# Convert cmd array to JSON

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 08:43
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0027_copy_archiveresult_to_process'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
]

View File

@@ -0,0 +1,181 @@
# Generated by hand on 2026-01-02
# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot)
from django.db import migrations, models, connection
from uuid import UUID
from archivebox.uuid_compat import uuid7
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
"""
Migrate ArchiveResult from integer PK to UUID PK.
Strategy:
1. Add old_id field to store current integer IDs
2. Generate UUIDs for any records missing them
3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
"""
cursor = connection.cursor()
# Check if table exists and has data
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
if not cursor.fetchone():
print('ArchiveResult table does not exist, skipping migration')
return
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
row_count = cursor.fetchone()[0]
if row_count == 0:
print('No ArchiveResult records to migrate')
return
print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
# Step 0: Check if machine_process table exists, if not NULL out process_id values
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
machine_process_exists = cursor.fetchone() is not None
if not machine_process_exists:
print('machine_process table does not exist yet, setting process_id to NULL')
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
# Step 1: Create new table with UUID as primary key
cursor.execute("""
CREATE TABLE core_archiveresult_new (
id TEXT PRIMARY KEY NOT NULL,
old_id INTEGER,
uuid TEXT UNIQUE,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL,
hook_name VARCHAR(255) NOT NULL DEFAULT '',
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
start_ts DATETIME,
end_ts DATETIME,
output_str TEXT NOT NULL DEFAULT '',
output_json TEXT,
output_files TEXT NOT NULL DEFAULT '{}',
output_size BIGINT NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
process_id TEXT,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL
);
""")
# Step 2: Generate UUIDs for records that don't have them
cursor.execute("SELECT id, uuid FROM core_archiveresult")
records = cursor.fetchall()
id_to_uuid = {}
for old_id, existing_uuid in records:
if existing_uuid:
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
# (existing UUIDs might be stored with or without dashes in old schema)
id_to_uuid[old_id] = UUID(existing_uuid).hex
else:
# Generate new UUIDv7 (time-ordered) as 32-char hex
id_to_uuid[old_id] = uuid7().hex
# Step 3: Copy data with UUIDs as new primary key
cursor.execute("SELECT * FROM core_archiveresult")
old_records = cursor.fetchall()
# Get column names
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = cursor.fetchall()
col_names = [col[1] for col in columns]
for i, record in enumerate(old_records):
old_id = record[col_names.index('id')]
new_uuid = id_to_uuid[old_id]
# Build insert with new structure
values = {col_names[i]: record[i] for i in range(len(col_names))}
# Check which fields exist in new table
fields_to_copy = [
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
'status', 'retry_at', 'start_ts', 'end_ts',
'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes',
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
]
# Build INSERT statement
existing_fields = [f for f in fields_to_copy if f in values]
placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
cursor.execute(
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
insert_values
)
# Step 4: Replace old table with new table
cursor.execute("DROP TABLE core_archiveresult")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
# Step 5: Create indexes
cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
class Migration(migrations.Migration):
dependencies = [
('core', '0028_alter_snapshot_fs_version'),
]
operations = [
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunPython(
migrate_archiveresult_id_to_uuid,
reverse_code=migrations.RunPython.noop,
),
],
state_operations=[
# Remove old uuid field
migrations.RemoveField(
model_name='archiveresult',
name='uuid',
),
# Change id from AutoField to UUIDField
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
),
# Add old_id field to preserve legacy integer IDs
migrations.AddField(
model_name='archiveresult',
name='old_id',
field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
),
],
),
]

View File

@@ -0,0 +1,19 @@
# Generated by Django 6.0 on 2026-01-02 10:02
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0029_migrate_archiveresult_to_uuid_pk'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version}{self._fs_current_version()}")
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
@@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Only run if method exists (most are no-ops)
if hasattr(self, method):
print(f"[DEBUG save()] Running {method}()")
getattr(self, method)()
current = next_ver
@@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
old_dir = self.get_storage_path_for_version('0.8.0')
new_dir = self.get_storage_path_for_version('0.9.0')
if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
# Even if no directory migration needed, still convert index format
self.convert_index_json_to_jsonl()
return
print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}")
if not old_dir.exists() or old_dir == new_dir:
# No migration needed
print(f"[DEBUG _fs_migrate] Returning None (early return)")
return None
if new_dir.exists():
# New directory already exists (files already copied), but we still need cleanup
# Return cleanup info so old directory can be cleaned up
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory and create symlink after successful migration.
Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
# Log but don't raise - migration succeeded, this is just cleanup
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
else:
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
print(f"[DEBUG] Successfully created symlink")
except Exception as e:
print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
else:
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not timestamp:
return None
# Look up existing
# Look up existing (try exact match first, then fuzzy match for truncated timestamps)
try:
return cls.objects.get(url=url, timestamp=timestamp)
snapshot = cls.objects.get(url=url, timestamp=timestamp)
print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
return snapshot
except cls.DoesNotExist:
print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
# Try fuzzy match - index.json may have truncated timestamp
# e.g., index has "1767000340" but DB has "1767000340.624737"
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
if candidates.count() == 1:
snapshot = candidates.first()
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
return candidates.first()
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
except cls.MultipleObjectsReturned:
# Should not happen with unique constraint
print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
return cls.objects.filter(url=url, timestamp=timestamp).first()
@classmethod
@@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return archiveresults
def advance_step_if_ready(self) -> bool:
"""
Advance current_step if all foreground hooks in current step are finished.
Called by the state machine to check if step can advance.
Background hooks (.bg) don't block step advancement.
Step advancement rules:
- All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
- Background ARs (hook_name contains '.bg.') are ignored for advancement
- When ready, increments current_step by 1 (up to 9)
Returns:
True if step was advanced, False if not ready or already at step 9.
"""
from archivebox.hooks import extract_step, is_background_hook
if self.current_step >= 9:
return False # Already at final step
# Get all ARs for current step that are foreground
current_step_ars = self.archiveresult_set.filter(
hook_name__isnull=False
).exclude(hook_name='')
# Check each AR in current step
for ar in current_step_ars:
ar_step = extract_step(ar.hook_name)
if ar_step != self.current_step:
continue # Not in current step
if is_background_hook(ar.hook_name):
continue # Background hooks don't block
# Foreground hook in current step - check if finished
if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
# Still pending/queued - can't advance
return False
if ar.status == ArchiveResult.StatusChoices.STARTED:
# Still running - can't advance
return False
# All foreground hooks in current step are finished - advance!
self.current_step += 1
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
"""
Check if this snapshot has finished processing.
Check if all ArchiveResults are finished.
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
Returns:
True if all archiveresults are finished (or no work to do), False otherwise.
Note: This is only called for observability/progress tracking.
SnapshotWorker owns the execution and doesn't poll this.
"""
# if no archiveresults exist yet, it's not finished
if not self.archiveresult_set.exists():
return False
# Check if any ARs are still pending/started
pending = self.archiveresult_set.exclude(
status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
).exists()
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.pending_archiveresults().exists():
return False
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
return not pending
def get_progress_stats(self) -> dict:
"""
@@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
@@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
can_start = bool(self.snapshot.url)
return can_start
def is_finished(self) -> bool:
"""Check if all ArchiveResults for this snapshot are finished."""
return self.snapshot.is_finished_processing()
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
import sys
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# Check if any archiveresults were created
ar_count = self.snapshot.archiveresult_set.count()
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
if ar_count == 0:
# No archiveresults created, seal immediately
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
self.seal()
else:
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
# Last AR will manually call self.seal() when done
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Snapshot.StatusChoices.STARTED,
)
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
self.snapshot.status = Snapshot.StatusChoices.STARTED
self.snapshot.retry_at = None # No more polling
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@sealed.enter
def enter_sealed(self):
@@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugins = [get_plugin_name(e) for e in get_plugins()]
return tuple((e, e) for e in plugins)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
id = models.AutoField(primary_key=True, editable=False)
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
# UUID primary key (migrated from integer in 0029)
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
# old_id preserves the legacy integer ID for backward compatibility
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
# Note: uuid field was removed in migration 0029 when id became UUID
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
start_ts = timezone.now()
is_bg_hook = False
process = None
for hook in hooks:
# Check if this is a background hook
is_bg_hook = is_background_hook(hook.name)
result = run_hook(
# Run hook using Process.launch() - returns Process model
process = run_hook(
hook,
output_dir=plugin_dir,
config=config,
@@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
depth=self.snapshot.depth,
)
# Background hooks return None
if result is None:
is_bg_hook = True
# Update status based on hook execution
if is_bg_hook:
# BACKGROUND HOOK - still running, return immediately
# Status stays STARTED, will be finalized by Snapshot.cleanup()
self.status = self.StatusChoices.STARTED
# Link ArchiveResult to Process
self.process = process
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
if not process:
# No hooks ran
self.status = self.StatusChoices.FAILED
self.output_str = 'No hooks executed'
self.save()
return
# Update status based on hook execution
if process.status == process.StatusChoices.RUNNING:
# BACKGROUND HOOK - still running, return immediately
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
return
# FOREGROUND HOOK - completed, update from filesystem
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.update_from_output()
# Clean up empty output directory if no files were created
@@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
# Flow: queued → started → (succeeded|failed|skipped)
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
def can_start(self) -> bool:
if not self.archiveresult.snapshot.url:
return False
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
# Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
def is_exceeded_max_attempts(self) -> bool:
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
from archivebox.config.configset import get_config
config = get_config(
@@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
status=ArchiveResult.StatusChoices.FAILED
).count()
if failed_count >= max_attempts:
# Mark this result as skipped since we've hit the limit
self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.retry_at = None
self.archiveresult.save()
return False
return True
return failed_count >= max_attempts
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
@@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
"""
Check if extraction has completed (success, failure, or skipped).
For background hooks in STARTED state, checks if their Process has finished and reaps them.
"""
# If already in final state, return True
if self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
):
return True
# If in STARTED state with a Process, check if Process has finished running
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
if self.archiveresult.process_id:
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running():
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
return False
@queued.enter
def enter_queued(self):
@@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def _check_and_seal_parent_snapshot(self):
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
"""
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
This method is kept for backwards compatibility with manual CLI commands.
"""
import sys
snapshot = self.archiveresult.snapshot
@@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_failed(self):
import sys
print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_skipped(self):
import sys
# Set output_str if not already set (e.g., when skipped due to max attempts)
if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
from archivebox.config.configset import get_config
config = get_config(
crawl=self.archiveresult.snapshot.crawl,
snapshot=self.archiveresult.snapshot,
)
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,

View File

@@ -281,25 +281,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
"""Editor for crawl URLs."""
widget_id = f'crawl_urls_{obj.pk}'
# Check if it's a local file we can edit
source_file = obj.get_file_path()
is_file = source_file is not None
file_contents = ""
error = None
if is_file and source_file:
try:
file_contents = source_file.read_text().strip()
except Exception as e:
error = f'Error reading {source_file}: {e}'
# Escape for safe HTML embedding
escaped_urls = (obj.urls or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
escaped_file_contents = file_contents.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
# Count lines for auto-expand logic
line_count = len((obj.urls or '').split('\n'))
file_line_count = len(file_contents.split('\n')) if file_contents else 0
uri_rows = min(max(3, line_count), 10)
html = f'''
@@ -318,21 +304,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
{line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
</p>
</div>
{"" if not is_file else f'''
<!-- File contents preview (if first URL is a file://) -->
<div style="margin-bottom: 8px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
</label>
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
<textarea id="{widget_id}_file_preview"
style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
readonly>{escaped_file_contents}</textarea>
</div>
'''}
</div>
'''
return mark_safe(html)

View File

@@ -114,22 +114,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
},
)
@classmethod
def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
              tags_str: str = '', config=None, created_by=None):
    """Create a crawl from a file containing URLs."""
    # Read the whole file; each line is treated as a URL entry downstream.
    contents = source_file.read_text()

    # Fall back to the filename as a human-readable label when none was given.
    crawl_label = label or source_file.name

    # created_by may be a User instance or a raw pk; normalize to a pk,
    # defaulting to the system user when nothing was supplied.
    creator_pk = getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk()

    return cls.objects.create(
        urls=contents,
        extractor=extractor,
        max_depth=max_depth,
        tags_str=tags_str,
        label=crawl_label,
        config=config or {},
        created_by_id=creator_pk,
    )
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
@@ -196,15 +180,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return crawl
@property
def output_dir_parent(self) -> str:
"""Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
date_str = self.created_at.strftime('%Y%m%d')
return f'users/{self.created_by.username}/crawls/{date_str}'
def OUTPUT_DIR(self) -> Path:
"""
Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
Domain is extracted from the first URL in the crawl.
"""
from archivebox import DATA_DIR
from archivebox.core.models import Snapshot
@property
def output_dir_name(self) -> str:
"""Use crawl ID as directory name"""
return str(self.id)
date_str = self.created_at.strftime('%Y%m%d')
urls = self.get_urls_list()
domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown'
return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id)
def get_urls_list(self) -> list[str]:
"""Get list of URLs from urls field, filtering out comments and empty lines."""
@@ -216,52 +204,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
def get_file_path(self) -> Path | None:
    """
    Get filesystem path if this crawl references a local file.
    Checks if the first URL is a file:// URI.
    """
    url_list = self.get_urls_list()
    if not url_list:
        return None

    candidate = url_list[0]
    if not candidate.startswith('file://'):
        return None

    # Strip only the leading scheme prefix; the remainder is the filesystem path.
    return Path(candidate.replace('file://', '', 1))
def create_root_snapshot(self) -> 'Snapshot':
    """Get or create the root Snapshot (depth=0) for this crawl's first URL.

    Returns:
        The existing queued/started Snapshot (requeued for immediate pickup),
        or a freshly created one in the crawl's initial state.

    Raises:
        ValueError: if the crawl has no URLs to snapshot.
    """
    from archivebox.core.models import Snapshot

    # The root snapshot is always based on the first (non-comment) URL.
    first_url = self.get_urls_list()[0] if self.get_urls_list() else None
    if not first_url:
        raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')

    # Try to get existing snapshot
    try:
        snapshot = Snapshot.objects.get(crawl=self, url=first_url)
        # If exists and already queued/started, return it as-is
        if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
            # Update retry_at to now so it can be picked up immediately
            snapshot.retry_at = timezone.now()
            snapshot.save(update_fields=['retry_at'])
            return snapshot
        # NOTE(review): if a snapshot exists in any OTHER status (e.g. sealed),
        # control falls through to the create() below — presumably intended as a
        # retry, but it may collide with a uniqueness constraint on (crawl, url).
        # TODO confirm intended behavior for already-finished root snapshots.
    except Snapshot.DoesNotExist:
        pass

    # Create new snapshot
    root_snapshot = Snapshot.objects.create(
        crawl=self,
        url=first_url,
        status=Snapshot.INITIAL_STATE,
        retry_at=timezone.now(),
        timestamp=str(timezone.now().timestamp()),
        depth=0,
    )
    return root_snapshot
def add_url(self, entry: dict) -> bool:
"""
@@ -316,11 +258,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
List of newly created Snapshot objects
"""
import sys
import json
from archivebox.core.models import Snapshot
created_snapshots = []
print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
for line in self.urls.splitlines():
if not line.strip():
continue
@@ -329,13 +275,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
try:
entry = json.loads(line)
url = entry.get('url', '')
depth = entry.get('depth', 1)
depth = entry.get('depth', 0)
title = entry.get('title')
timestamp = entry.get('timestamp')
tags = entry.get('tags', '')
except json.JSONDecodeError:
url = line.strip()
depth = 1
depth = 0
title = None
timestamp = None
tags = ''
@@ -379,41 +325,90 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
"""
import time
import json
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.config.configset import get_config
# Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
debug_log = Path('/tmp/archivebox_crawl_debug.log')
with open(debug_log, 'a') as f:
f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n')
f.flush()
# Get merged config with crawl context
config = get_config(crawl=self)
# Load all binaries.jsonl files from plugins
# This replaces individual on_Crawl install hooks with declarative configuration
from archivebox.hooks import BUILTIN_PLUGINS_DIR
from archivebox.machine.models import Machine
machine_id = str(Machine.current().id)
binaries_records = []
for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'):
try:
with open(binaries_file, 'r') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
try:
record = json.loads(line)
if record.get('type') == 'Binary':
record['machine_id'] = machine_id
binaries_records.append(record)
except json.JSONDecodeError:
pass
except Exception:
pass
# Process binary declarations before running hooks
if binaries_records:
overrides = {'crawl': self}
process_hook_records(binaries_records, overrides=overrides)
# Discover and run on_Crawl hooks
with open(debug_log, 'a') as f:
f.write(f'Discovering Crawl hooks...\n')
f.flush()
hooks = discover_hooks('Crawl', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
with open(debug_log, 'a') as f:
f.write(f'Found {len(hooks)} hooks\n')
f.flush()
for hook in hooks:
with open(debug_log, 'a') as f:
f.write(f'Running hook: {hook.name}\n')
f.flush()
hook_start = time.time()
plugin_name = hook.parent.name
output_dir = self.OUTPUT_DIR / plugin_name
output_dir.mkdir(parents=True, exist_ok=True)
result = run_hook(
# Run hook using Process.launch() - returns Process model
process = run_hook(
hook,
output_dir=output_dir,
config=config,
crawl_id=str(self.id),
source_url=first_url,
source_url=self.urls, # Pass full newline-separated URLs
)
with open(debug_log, 'a') as f:
f.write(f'Hook {hook.name} completed with status={process.status}\n')
f.flush()
hook_elapsed = time.time() - hook_start
if hook_elapsed > 0.5: # Log slow hooks
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
# Background hook - returns None, continues running
if result is None:
# Background hook - still running
if process.status == process.StatusChoices.RUNNING:
continue
# Foreground hook - process JSONL records
records = result.get('records', [])
from archivebox.hooks import extract_records_from_process
records = extract_records_from_process(process)
if records:
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
for record in records[:3]: # Show first 3
@@ -423,14 +418,33 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if stats:
print(f'[green]✓ Created: {stats}[/green]')
# System crawls (archivebox://*) don't create snapshots - they just run hooks
if first_url.startswith('archivebox://'):
return None
# Create snapshots from all URLs in self.urls
with open(debug_log, 'a') as f:
f.write(f'Creating snapshots from URLs...\n')
f.flush()
created_snapshots = self.create_snapshots_from_urls()
with open(debug_log, 'a') as f:
f.write(f'Created {len(created_snapshots)} snapshots\n')
f.write(f'=== Crawl.run() complete ===\n\n')
f.flush()
return created_snapshots[0] if created_snapshots else None
# Create snapshots from URLs
root_snapshot = self.create_root_snapshot()
self.create_snapshots_from_urls()
return root_snapshot
def is_finished(self) -> bool:
    """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
    from archivebox.core.models import Snapshot

    crawl_snapshots = Snapshot.objects.filter(crawl=self)

    # No snapshots at all (e.g. archivebox://install crawls that only run hooks)
    # means there is nothing left to wait for.
    if not crawl_snapshots.exists():
        return True

    # Finished only when no snapshot remains in an active (queued/started) state.
    active_states = [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
    return not crawl_snapshots.filter(status__in=active_states).exists()
def cleanup(self):
"""Clean up background hooks and run on_CrawlEnd hooks."""
@@ -452,7 +466,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
config = get_config(crawl=self)
hooks = discover_hooks('CrawlEnd', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
plugin_name = hook.parent.name
@@ -464,7 +477,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
output_dir=output_dir,
config=config,
crawl_id=str(self.id),
source_url=first_url,
source_url=self.urls, # Pass full newline-separated URLs
)
# Log failures but don't block
@@ -494,7 +507,6 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
│ - run_hook(script, output_dir, ...) │
│ - Parse JSONL from hook output │
│ - process_hook_records() → creates Snapshots │
│ • create_root_snapshot() → root snapshot for crawl │
│ • create_snapshots_from_urls() → from self.urls field │
│ │
│ 2. Snapshots process independently with their own │
@@ -518,7 +530,8 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start')
queued.to(started, cond='can_start') |
started.to(sealed, cond='is_finished')
)
# Manual event (triggered by last Snapshot sealing)
@@ -534,6 +547,10 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
return False
return True
def is_finished(self) -> bool:
"""Check if all Snapshots for this crawl are finished."""
return self.crawl.is_finished()
@started.enter
def enter_started(self):
import sys
@@ -543,25 +560,21 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
try:
# Run the crawl - runs hooks, processes JSONL, creates snapshots
root_snapshot = self.crawl.run()
first_snapshot = self.crawl.run()
if root_snapshot:
print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
if first_snapshot:
print(f'[cyan]🔄 Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr)
# Update status to STARTED
# Set retry_at to None so workers don't claim us (we wait for snapshots to finish)
# Last snapshot will manually call self.seal() when done
# Set retry_at to near future so tick() can poll and check is_finished()
self.crawl.update_and_requeue(
retry_at=None,
retry_at=timezone.now() + timedelta(seconds=2),
status=Crawl.StatusChoices.STARTED,
)
else:
# No snapshots (system crawl like archivebox://install)
print(f'[cyan]🔄 No snapshots created, allowing immediate seal[/cyan]', file=sys.stderr)
# Set retry_at=now so next tick() will transition to sealed
self.crawl.update_and_requeue(
retry_at=timezone.now(),
status=Crawl.StatusChoices.STARTED,
)
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
# Seal immediately since there's no work to do
self.seal()
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')

View File

@@ -240,13 +240,14 @@ def run_hook(
output_dir: Path,
config: Dict[str, Any],
timeout: Optional[int] = None,
parent: Optional['Process'] = None,
**kwargs: Any
) -> HookResult:
) -> 'Process':
"""
Execute a hook script with the given arguments.
Execute a hook script with the given arguments using Process model.
This is the low-level hook executor. For running extractors with proper
metadata handling, use call_extractor() instead.
This is the low-level hook executor that creates a Process record and
uses Process.launch() for subprocess management.
Config is passed to hooks via environment variables. Caller MUST use
get_config() to merge all sources (file, env, machine, crawl, snapshot).
@@ -257,16 +258,20 @@ def run_hook(
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time in seconds
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
parent: Optional parent Process (for tracking worker->hook hierarchy)
**kwargs: Arguments passed to the script as --key=value
Returns:
HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
Process model instance (use process.exit_code, process.stdout, process.get_records())
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
if process.status == 'exited':
records = process.get_records() # Get parsed JSONL output
"""
from archivebox.machine.models import Process, Machine
import time
start_time = time.time()
@@ -276,18 +281,32 @@ def run_hook(
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
# Get current machine
machine = Machine.current()
# Auto-detect parent process if not explicitly provided
# This enables automatic hierarchy tracking: Worker -> Hook
if parent is None:
try:
parent = Process.current()
except Exception:
# If Process.current() fails (e.g., not in a worker context), leave parent as None
pass
if not script.exists():
return HookResult(
returncode=1,
stdout='',
# Create a failed Process record for hooks that don't exist
process = Process.objects.create(
machine=machine,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=['echo', f'Hook script not found: {script}'],
timeout=timeout,
status=Process.StatusChoices.EXITED,
exit_code=1,
stderr=f'Hook script not found: {script}',
output_json=None,
output_files=[],
duration_ms=0,
hook=str(script),
plugin=script.parent.name,
hook_name=script.name,
)
return process
# Determine the interpreter based on file extension
ext = script.suffix.lower()
@@ -379,130 +398,138 @@ def run_hook(
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)
# Capture files before execution to detect new output
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
# Detect if this is a background hook (long-running daemon)
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
# Old convention: __background in stem (for backwards compatibility)
is_background = '.bg.' in script.name or '__background' in script.stem
# Set up output files for ALL hooks (useful for debugging)
stdout_file = output_dir / 'stdout.log'
stderr_file = output_dir / 'stderr.log'
pid_file = output_dir / 'hook.pid'
cmd_file = output_dir / 'cmd.sh'
try:
# Write command script for validation
from archivebox.misc.process_utils import write_cmd_file
write_cmd_file(cmd_file, cmd)
# Open log files for writing
with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
process = subprocess.Popen(
cmd,
cwd=str(output_dir),
stdout=out,
stderr=err,
env=env,
)
# Write PID with mtime set to process start time for validation
from archivebox.misc.process_utils import write_pid_file_with_mtime
process_start_time = time.time()
write_pid_file_with_mtime(pid_file, process.pid, process_start_time)
if is_background:
# Background hook - return None immediately, don't wait
# Process continues running, writing to stdout.log
# ArchiveResult will poll for completion later
return None
# Normal hook - wait for completion with timeout
try:
returncode = process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
process.wait() # Clean up zombie
duration_ms = int((time.time() - start_time) * 1000)
return HookResult(
returncode=-1,
stdout='',
stderr=f'Hook timed out after {timeout} seconds',
output_json=None,
output_files=[],
duration_ms=duration_ms,
hook=str(script),
)
# Read output from files
stdout = stdout_file.read_text() if stdout_file.exists() else ''
stderr = stderr_file.read_text() if stderr_file.exists() else ''
# Detect new files created by the hook
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
# Exclude the log files themselves from new_files
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
# Parse JSONL output from stdout
# Each line starting with { that has 'type' field is a record
records = []
plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
hook_name = script.name # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
for line in stdout.splitlines():
line = line.strip()
if not line or not line.startswith('{'):
continue
try:
data = json.loads(line)
if 'type' in data:
# Add plugin metadata to every record
data['plugin'] = plugin_name
data['hook_name'] = hook_name
data['plugin_hook'] = str(script)
records.append(data)
except json.JSONDecodeError:
pass
duration_ms = int((time.time() - start_time) * 1000)
# Clean up log files on success (keep on failure for debugging)
if returncode == 0:
stdout_file.unlink(missing_ok=True)
stderr_file.unlink(missing_ok=True)
pid_file.unlink(missing_ok=True)
return HookResult(
returncode=returncode,
stdout=stdout,
stderr=stderr,
output_json=None, # Legacy field, we now use records for JSONL
output_files=new_files,
duration_ms=duration_ms,
hook=str(script),
plugin=plugin_name,
hook_name=hook_name,
records=records,
# Create Process record
process = Process.objects.create(
machine=machine,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
timeout=timeout,
)
# Build environment from config (Process._build_env() expects self.env dict)
# We need to set env on the process before launching
process.env = {}
for key, value in config.items():
if value is None:
continue
elif isinstance(value, bool):
process.env[key] = 'true' if value else 'false'
elif isinstance(value, (list, dict)):
process.env[key] = json.dumps(value)
else:
process.env[key] = str(value)
# Add base paths to env
process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
# Add LIB_DIR and LIB_BIN_DIR
lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
if lib_dir:
process.env['LIB_DIR'] = str(lib_dir)
if not lib_bin_dir and lib_dir:
lib_bin_dir = Path(lib_dir) / 'bin'
if lib_bin_dir:
process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
# Set PATH from Machine.config if available
try:
if machine and machine.config:
machine_path = machine.config.get('config/PATH')
if machine_path:
# Prepend LIB_BIN_DIR to machine PATH as well
if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
else:
process.env['PATH'] = machine_path
elif lib_bin_dir:
# Just prepend to current PATH
current_path = os.environ.get('PATH', '')
if not current_path.startswith(f'{lib_bin_dir}:'):
process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
# Also set NODE_MODULES_DIR if configured
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
if node_modules_dir:
process.env['NODE_MODULES_DIR'] = node_modules_dir
except Exception:
pass # Fall back to system PATH if Machine not available
# Save env before launching
process.save()
# Launch subprocess using Process.launch()
process.launch(background=is_background)
# Return Process object (caller can use process.exit_code, process.stdout, process.get_records())
return process
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
return HookResult(
returncode=-1,
stdout='',
# Create a failed Process record for exceptions
process = Process.objects.create(
machine=machine,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
timeout=timeout,
status=Process.StatusChoices.EXITED,
exit_code=-1,
stderr=f'Failed to run hook: {type(e).__name__}: {e}',
output_json=None,
output_files=[],
duration_ms=duration_ms,
hook=str(script),
plugin=script.parent.name,
hook_name=script.name,
records=[],
)
return process
def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
    """
    Extract JSONL records from a Process's stdout.

    Uses the same parse_line() logic from misc/jsonl.py.
    Adds plugin metadata to each record.

    Args:
        process: Process model instance with stdout captured

    Returns:
        List of parsed JSONL records with plugin metadata
    """
    from archivebox.misc.jsonl import parse_line

    # Prefer the in-memory stdout; fall back to the on-disk log if present.
    output = process.stdout
    if not output and process.stdout_file and process.stdout_file.exists():
        output = process.stdout_file.read_text()
    if not output:
        return []

    # Derive plugin metadata from the working dir and command line.
    # cmd is typically [interpreter, script_path, ...args] — TODO confirm.
    script_arg = process.cmd[1] if len(process.cmd) > 1 else ''
    plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
    hook_name = Path(script_arg).name if script_arg else 'unknown'

    parsed: List[Dict[str, Any]] = []
    for raw_line in output.splitlines():
        record = parse_line(raw_line)
        if not record or 'type' not in record:
            continue
        # Only fill metadata keys the hook didn't already emit itself.
        record.setdefault('plugin', plugin_name)
        record.setdefault('hook_name', hook_name)
        record.setdefault('plugin_hook', script_arg)
        parsed.append(record)
    return parsed
def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
@@ -940,7 +967,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
import sys
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 08:43
from django.db import migrations, models
class Migration(migrations.Migration):
    # Alters Binary.status to a CharField limited to the choices
    # ('queued', 'installed'), defaulting to 'queued', with a db index.

    dependencies = [
        ('machine', '0008_add_worker_type_field'),
    ]

    operations = [
        migrations.AlterField(
            model_name='binary',
            name='status',
            field=models.CharField(choices=[('queued', 'Queued'), ('installed', 'Installed')], db_index=True, default='queued', max_length=16),
        ),
    ]

View File

@@ -683,6 +683,7 @@ class Process(models.Model):
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
WORKER = 'worker', 'Worker'
CLI = 'cli', 'CLI'
HOOK = 'hook', 'Hook'
BINARY = 'binary', 'Binary'
# Primary fields
@@ -1415,6 +1416,10 @@ class Process(models.Model):
"""
Check if process has exited and update status if so.
Cleanup when process exits:
- Copy stdout/stderr to DB (keep files for debugging)
- Delete PID file
Returns:
exit_code if exited, None if still running
"""
@@ -1422,11 +1427,25 @@ class Process(models.Model):
return self.exit_code
if not self.is_running:
# Process exited - read output and update status
# Process exited - read output and copy to DB
if self.stdout_file and self.stdout_file.exists():
self.stdout = self.stdout_file.read_text()
# TODO: Uncomment to cleanup (keeping for debugging for now)
# self.stdout_file.unlink(missing_ok=True)
if self.stderr_file and self.stderr_file.exists():
self.stderr = self.stderr_file.read_text()
# TODO: Uncomment to cleanup (keeping for debugging for now)
# self.stderr_file.unlink(missing_ok=True)
# Clean up PID file (not needed for debugging)
if self.pid_file and self.pid_file.exists():
self.pid_file.unlink(missing_ok=True)
# TODO: Uncomment to cleanup cmd.sh (keeping for debugging for now)
# if self.pwd:
# cmd_file = Path(self.pwd) / 'cmd.sh'
# if cmd_file.exists():
# cmd_file.unlink(missing_ok=True)
# Try to get exit code from proc or default to unknown
self.exit_code = self.exit_code if self.exit_code is not None else -1
@@ -1686,6 +1705,46 @@ class Process(models.Model):
"""
return cls.get_running_count(process_type=process_type, machine=machine)
    @classmethod
    def cleanup_orphaned_chrome(cls) -> int:
        """
        Kill orphaned Chrome processes using chrome_utils.js killZombieChrome.
        Scans DATA_DIR for chrome/*.pid files from stale crawls (>5 min old)
        and kills any orphaned Chrome processes.
        Called by:
        - Orchestrator on startup (cleanup from previous crashes)
        - Orchestrator periodically (every N minutes)
        Returns:
            Number of zombie Chrome processes killed
        """
        import subprocess
        from pathlib import Path
        from django.conf import settings

        # chrome_utils.js lives in the sibling plugins/chrome/ package dir.
        chrome_utils = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'chrome_utils.js'
        if not chrome_utils.exists():
            # Plugin script missing -- nothing we can clean up.
            return 0
        try:
            # Delegate the scan+kill to node; the script prints the kill count.
            result = subprocess.run(
                ['node', str(chrome_utils), 'killZombieChrome', str(settings.DATA_DIR)],
                capture_output=True,
                timeout=30,
                text=True,
            )
            if result.returncode == 0:
                # stdout is expected to be a bare integer; a non-numeric value
                # raises ValueError, handled below as a cleanup failure.
                killed = int(result.stdout.strip())
                if killed > 0:
                    print(f'[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]')
                return killed
        except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e:
            # FileNotFoundError: node not on PATH. Best-effort only -- log and
            # fall through to return 0 rather than raising.
            print(f'[red]Failed to cleanup orphaned Chrome: {e}[/red]')
        return 0
# =============================================================================
# Binary State Machine

View File

@@ -530,13 +530,13 @@ def log_worker_event(
Log a worker event with structured metadata and indentation.
Args:
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker)
event: Event name (Starting, Completed, Failed, etc.)
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker)
pid: Process ID
worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
plugin: Plugin name (for ArchiveResultWorker)
worker_id: Worker ID (UUID for workers)
url: URL being processed (for SnapshotWorker)
plugin: Plugin name (for hook processes)
metadata: Dict of metadata to show in curly braces
error: Exception if event is an error
"""

View File

@@ -0,0 +1,345 @@
"""
Rich Layout-based live progress display for ArchiveBox orchestrator.
Shows a comprehensive dashboard with:
- Top: Crawl queue status (full width)
- Middle: 4-column grid of SnapshotWorker progress panels
- Bottom: Orchestrator/Daphne logs
"""
__package__ = 'archivebox.misc'
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from collections import deque
from rich import box
from rich.align import Align
from rich.console import Console, Group, RenderableType
from rich.layout import Layout
from rich.panel import Panel
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
from rich.table import Table
from rich.text import Text
from archivebox.config import VERSION
# Maximum number of SnapshotWorker columns to display
MAX_WORKER_COLUMNS = 4
class CrawlQueuePanel:
    """Full-width header panel summarizing the crawl queue and orchestrator state."""

    def __init__(self):
        # Display state; mutated externally by the layout manager.
        self.orchestrator_status = "Idle"
        self.crawl_queue_count = 0
        self.crawl_workers_count = 0
        self.max_crawl_workers = 8
        self.crawl_id: Optional[str] = None

    def __rich__(self) -> Panel:
        # Cell 1: version + current UTC wall-clock time.
        version_cell = Text()
        version_cell.append("ArchiveBox ", style="bold cyan")
        version_cell.append(f"v{VERSION}", style="bold yellow")
        version_cell.append(f"{datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")

        # Cell 2: how many crawls are waiting (dimmed when zero).
        queued_cell = Text()
        queued_cell.append("Crawls: ", style="white")
        queued_cell.append(
            str(self.crawl_queue_count),
            style="bold yellow" if self.crawl_queue_count > 0 else "bold grey53",
        )
        queued_cell.append(" queued", style="grey53")

        # Cell 3: active CrawlWorkers vs capacity.
        workers_cell = Text()
        workers_cell.append("Workers: ", style="white")
        workers_cell.append(
            f"{self.crawl_workers_count}/{self.max_crawl_workers}",
            style="bold green" if self.crawl_workers_count > 0 else "bold grey53",
        )
        workers_cell.append(" active", style="grey53")

        # Cell 4: orchestrator status, plus a short crawl id when present.
        status_cell = Text()
        status_cell.append("Status: ", style="white")
        status_cell.append(
            self.orchestrator_status,
            style="bold green" if self.crawl_workers_count > 0 else "bold grey53",
        )
        if self.crawl_id:
            status_cell.append(f" [{self.crawl_id[:8]}]", style="grey53")

        header = Table.grid(expand=True)
        header.add_column(justify="left", ratio=1)
        header.add_column(justify="center", ratio=1)
        header.add_column(justify="center", ratio=1)
        header.add_column(justify="right", ratio=1)
        header.add_row(version_cell, queued_cell, workers_cell, status_cell)
        return Panel(header, style="white on blue", box=box.ROUNDED)
class SnapshotWorkerPanel:
    """Display progress for a single SnapshotWorker."""

    def __init__(self, worker_num: int):
        self.worker_num = worker_num
        # Snapshot currently assigned to this panel (None while idle)
        self.snapshot_id: Optional[str] = None
        self.snapshot_url: Optional[str] = None
        # Hook counters driving the progress bar
        self.total_hooks: int = 0
        self.completed_hooks: int = 0
        self.current_plugin: Optional[str] = None
        self.status: str = "idle"  # idle, working, completed
        # Rolling window of the last few log lines shown in the panel
        self.recent_logs: deque = deque(maxlen=5)

    def __rich__(self) -> Panel:
        """Render the panel: an idle placeholder or the live progress view."""
        if self.status == "idle":
            content = Align.center(
                Text("Idle", style="grey53"),
                vertical="middle",
            )
            border_style = "grey53"
            title_style = "grey53"
        else:
            # Build progress display
            lines = []
            # URL (truncated to keep the narrow column readable)
            if self.snapshot_url:
                url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
                lines.append(Text(url_display, style="cyan"))
                lines.append(Text())  # Spacing
            # Progress bar
            if self.total_hooks > 0:
                pct = (self.completed_hooks / self.total_hooks) * 100
                bar_width = 30
                filled = int((pct / 100) * bar_width)
                # FIX: the bar glyphs had been lost to encoding mangling
                # (both halves were empty strings, so no bar was drawn).
                bar = "█" * filled + "░" * (bar_width - filled)
                # Color based on progress
                if pct < 30:
                    bar_style = "yellow"
                elif pct < 100:
                    bar_style = "green"
                else:
                    bar_style = "blue"
                progress_text = Text()
                progress_text.append(bar, style=bar_style)
                progress_text.append(f" {pct:.0f}%", style="white")
                lines.append(progress_text)
                lines.append(Text())  # Spacing
            # Stats table: hooks done / total, plus the plugin currently running
            stats = Table.grid(padding=(0, 1))
            stats.add_column(style="grey53", no_wrap=True)
            stats.add_column(style="white")
            stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
            if self.current_plugin:
                stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
            lines.append(stats)
            lines.append(Text())  # Spacing
            # Recent logs (truncated to 30 chars each)
            if self.recent_logs:
                lines.append(Text("Recent:", style="grey53"))
                for log_msg, log_style in self.recent_logs:
                    log_text = Text(f"{log_msg[:30]}", style=log_style)
                    lines.append(log_text)
            content = Group(*lines)
            border_style = "green" if self.status == "working" else "blue"
            title_style = "green" if self.status == "working" else "blue"
        return Panel(
            content,
            title=f"[{title_style}]Worker {self.worker_num}",
            border_style=border_style,
            box=box.ROUNDED,
            height=20,
        )

    def add_log(self, message: str, style: str = "white"):
        """Add a log message to this worker's recent logs (keeps last 5)."""
        self.recent_logs.append((message, style))
class OrchestratorLogPanel:
    """Scrolling log of orchestrator and system events, shown as a bottom panel."""

    def __init__(self, max_events: int = 15):
        # Bounded buffer: oldest events are evicted automatically.
        self.events: deque = deque(maxlen=max_events)
        self.max_events = max_events

    def add_event(self, message: str, style: str = "white"):
        """Record an event stamped with the current UTC wall-clock time."""
        stamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
        self.events.append((stamp, message, style))

    def __rich__(self) -> Panel:
        if self.events:
            rendered = []
            for stamp, message, style in self.events:
                row = Text()
                row.append(f"[{stamp}] ", style="grey53")
                row.append(message, style=style)
                rendered.append(row)
            body = Group(*rendered)
        else:
            body = Text("No recent events", style="grey53", justify="center")
        return Panel(
            body,
            title="[bold white]Orchestrator / Daphne Logs",
            border_style="white",
            box=box.ROUNDED,
            height=12,
        )
class ArchiveBoxProgressLayout:
    """
    Main layout manager for ArchiveBox orchestrator progress display.
    Layout structure:
    ┌─────────────────────────────────────────────────────────────┐
    │ Crawl Queue (full width) │
    ├───────────────┬───────────────┬───────────────┬─────────────┤
    │ Snapshot │ Snapshot │ Snapshot │ Snapshot │
    │ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
    │ │ │ │ │
    │ Progress + │ Progress + │ Progress + │ Progress + │
    │ Stats + │ Stats + │ Stats + │ Stats + │
    │ Logs │ Logs │ Logs │ Logs │
    ├───────────────┴───────────────┴───────────────┴─────────────┤
    │ Orchestrator / Daphne Logs │
    └─────────────────────────────────────────────────────────────┘
    """

    def __init__(self, crawl_id: Optional[str] = None):
        self.crawl_id = crawl_id
        self.start_time = datetime.now(timezone.utc)
        # Create components
        self.crawl_queue = CrawlQueuePanel()
        self.crawl_queue.crawl_id = crawl_id
        # Create 4 worker panels (fixed grid of MAX_WORKER_COLUMNS columns)
        self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
        self.orchestrator_log = OrchestratorLogPanel(max_events=12)
        # Create layout
        self.layout = self._make_layout()
        # Track snapshot ID to worker panel mapping
        self.snapshot_to_worker: Dict[str, int] = {}  # snapshot_id -> worker_panel_index

    def _make_layout(self) -> Layout:
        """Define the layout structure."""
        layout = Layout(name="root")
        # Top-level split: crawl_queue, workers, logs
        layout.split(
            Layout(name="crawl_queue", size=3),
            Layout(name="workers", ratio=1),
            Layout(name="logs", size=13),
        )
        # Split workers into 4 columns
        layout["workers"].split_row(
            Layout(name="worker1"),
            Layout(name="worker2"),
            Layout(name="worker3"),
            Layout(name="worker4"),
        )
        # Assign components to layout sections; each panel re-renders live
        # via its __rich__ method on every refresh.
        layout["crawl_queue"].update(self.crawl_queue)
        layout["worker1"].update(self.worker_panels[0])
        layout["worker2"].update(self.worker_panels[1])
        layout["worker3"].update(self.worker_panels[2])
        layout["worker4"].update(self.worker_panels[3])
        layout["logs"].update(self.orchestrator_log)
        return layout

    def update_orchestrator_status(
        self,
        status: str,
        crawl_queue_count: int = 0,
        crawl_workers_count: int = 0,
        max_crawl_workers: int = 8,
    ):
        """Update orchestrator status in the crawl queue panel."""
        self.crawl_queue.orchestrator_status = status
        self.crawl_queue.crawl_queue_count = crawl_queue_count
        self.crawl_queue.crawl_workers_count = crawl_workers_count
        self.crawl_queue.max_crawl_workers = max_crawl_workers

    def update_snapshot_worker(
        self,
        snapshot_id: str,
        url: str,
        total: int,
        completed: int,
        current_plugin: str = "",
    ):
        """Update or assign a snapshot to a worker panel."""
        # Find or assign worker panel for this snapshot
        if snapshot_id not in self.snapshot_to_worker:
            # Find first idle worker panel
            worker_idx = None
            for idx, panel in enumerate(self.worker_panels):
                if panel.status == "idle":
                    worker_idx = idx
                    break
            # If no idle worker, use round-robin (shouldn't happen often)
            # NOTE(review): panels left in "completed" state are not "idle",
            # so they are only reclaimed via this round-robin path until
            # remove_snapshot_worker() resets them.
            if worker_idx is None:
                worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
            self.snapshot_to_worker[snapshot_id] = worker_idx
        # Get assigned worker panel
        worker_idx = self.snapshot_to_worker[snapshot_id]
        panel = self.worker_panels[worker_idx]
        # Update panel
        panel.snapshot_id = snapshot_id
        panel.snapshot_url = url
        panel.total_hooks = total
        panel.completed_hooks = completed
        panel.current_plugin = current_plugin
        panel.status = "working" if completed < total else "completed"

    def remove_snapshot_worker(self, snapshot_id: str):
        """Mark a snapshot worker as idle after completion."""
        if snapshot_id in self.snapshot_to_worker:
            worker_idx = self.snapshot_to_worker[snapshot_id]
            panel = self.worker_panels[worker_idx]
            # Mark as idle and clear all per-snapshot display state
            panel.status = "idle"
            panel.snapshot_id = None
            panel.snapshot_url = None
            panel.total_hooks = 0
            panel.completed_hooks = 0
            panel.current_plugin = None
            panel.recent_logs.clear()
            # Remove mapping
            del self.snapshot_to_worker[snapshot_id]

    def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
        """Add a log message to a specific worker's panel."""
        if snapshot_id in self.snapshot_to_worker:
            worker_idx = self.snapshot_to_worker[snapshot_id]
            self.worker_panels[worker_idx].add_log(message, style)

    def log_event(self, message: str, style: str = "white"):
        """Add an event to the orchestrator log."""
        self.orchestrator_log.add_event(message, style)

    def get_layout(self) -> Layout:
        """Get the Rich Layout object for rendering."""
        return self.layout

View File

@@ -72,10 +72,8 @@ class TestAccessibilityWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the accessibility hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run accessibility hook with the active Chrome session
result = subprocess.run(
@@ -116,6 +114,85 @@ class TestAccessibilityWithChrome(TestCase):
self.skipTest(f"Chrome session setup failed: {e}")
raise
def test_accessibility_disabled_skips(self):
"""Test that ACCESSIBILITY_ENABLED=False skips without error."""
test_url = 'https://example.com'
snapshot_id = 'test-disabled'
env = get_test_env()
env['ACCESSIBILITY_ENABLED'] = 'False'
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(self.temp_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit 0 even when disabled
self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}")
# Should NOT create output file when disabled
accessibility_output = self.temp_dir / 'accessibility.json'
self.assertFalse(accessibility_output.exists(), "Should not create file when disabled")
def test_accessibility_missing_url_argument(self):
"""Test that missing --url argument causes error."""
snapshot_id = 'test-missing-url'
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'],
cwd=str(self.temp_dir),
capture_output=True,
text=True,
timeout=30,
env=get_test_env()
)
# Should fail with non-zero exit code
self.assertNotEqual(result.returncode, 0, "Should fail when URL missing")
def test_accessibility_missing_snapshot_id_argument(self):
"""Test that missing --snapshot-id argument causes error."""
test_url = 'https://example.com'
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'],
cwd=str(self.temp_dir),
capture_output=True,
text=True,
timeout=30,
env=get_test_env()
)
# Should fail with non-zero exit code
self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing")
def test_accessibility_with_no_chrome_session(self):
"""Test that hook fails gracefully when no Chrome session exists."""
test_url = 'https://example.com'
snapshot_id = 'test-no-chrome'
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(self.temp_dir),
capture_output=True,
text=True,
timeout=30,
env=get_test_env()
)
# Should fail when no Chrome session
self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists")
# Error should mention CDP or Chrome
err_lower = result.stderr.lower()
self.assertTrue(
any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']),
f"Should mention Chrome/CDP in error: {result.stderr}"
)
if __name__ == '__main__':
    # Allow running this test module directly (outside a pytest invocation).
    pytest.main([__file__, '-v'])

View File

@@ -1397,11 +1397,11 @@ function getMachineType() {
*/
function getLibDir() {
if (process.env.LIB_DIR) {
return process.env.LIB_DIR;
return path.resolve(process.env.LIB_DIR);
}
const dataDir = getEnv('DATA_DIR', './data');
const machineType = getMachineType();
return path.join(dataDir, 'lib', machineType);
return path.resolve(path.join(dataDir, 'lib', machineType));
}
/**
@@ -1412,9 +1412,9 @@ function getLibDir() {
*/
function getNodeModulesDir() {
if (process.env.NODE_MODULES_DIR) {
return process.env.NODE_MODULES_DIR;
return path.resolve(process.env.NODE_MODULES_DIR);
}
return path.join(getLibDir(), 'npm', 'node_modules');
return path.resolve(path.join(getLibDir(), 'npm', 'node_modules'));
}
/**

View File

@@ -37,9 +37,8 @@ Usage:
# For Chrome session tests:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_chrome_session, # Full Chrome + tab setup
cleanup_chrome, # Cleanup by PID
chrome_session, # Context manager
chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup)
cleanup_chrome, # Manual cleanup by PID (rarely needed)
)
# For extension tests:
@@ -184,8 +183,7 @@ def get_lib_dir() -> Path:
# Fallback to Python
if os.environ.get('LIB_DIR'):
return Path(os.environ['LIB_DIR'])
from archivebox.config.common import STORAGE_CONFIG
return Path(str(STORAGE_CONFIG.LIB_DIR))
raise Exception('LIB_DIR env var must be set!')
def get_node_modules_dir() -> Path:
@@ -695,111 +693,6 @@ def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
# =============================================================================
def setup_chrome_session(
    tmpdir: Path,
    crawl_id: str = 'test-crawl',
    snapshot_id: str = 'test-snapshot',
    test_url: str = 'about:blank',
    navigate: bool = True,
    timeout: int = 15,
) -> Tuple[subprocess.Popen, int, Path]:
    """Set up a Chrome session with tab and optional navigation.
    Creates the directory structure, launches Chrome, creates a tab,
    and optionally navigates to the test URL.
    Args:
        tmpdir: Temporary directory for test files
        crawl_id: ID to use for the crawl
        snapshot_id: ID to use for the snapshot
        test_url: URL to navigate to (if navigate=True)
        navigate: Whether to navigate to the URL after creating tab
        timeout: Seconds to wait for Chrome to start
    Returns:
        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
    Raises:
        RuntimeError: If Chrome fails to start or tab creation fails
    """
    # Crawl-level layout: <tmpdir>/crawl/chrome holds the shared Chrome state
    # (cdp_url.txt, chrome.pid) used by every snapshot in the crawl.
    crawl_dir = Path(tmpdir) / 'crawl'
    crawl_dir.mkdir(exist_ok=True)
    chrome_dir = crawl_dir / 'chrome'
    chrome_dir.mkdir(exist_ok=True)
    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'
    # Launch Chrome at crawl level
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )
    # Wait for Chrome to launch: poll once per second until the launch hook
    # writes cdp_url.txt, failing early if the launcher process exits.
    for i in range(timeout):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if (chrome_dir / 'cdp_url.txt').exists():
            break
        time.sleep(1)
    if not (chrome_dir / 'cdp_url.txt').exists():
        raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
    # Create snapshot directory structure
    snapshot_dir = Path(tmpdir) / 'snapshot'
    snapshot_dir.mkdir(exist_ok=True)
    snapshot_chrome_dir = snapshot_dir / 'chrome'
    snapshot_chrome_dir.mkdir(exist_ok=True)
    # Create tab; CRAWL_OUTPUT_DIR tells the tab hook where the crawl-level
    # Chrome state lives.
    tab_env = env.copy()
    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
    try:
        result = subprocess.run(
            ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=tab_env
        )
        if result.returncode != 0:
            # Kill Chrome before surfacing the error so tests don't leak it.
            cleanup_chrome(chrome_launch_process, chrome_pid)
            raise RuntimeError(f"Tab creation failed: {result.stderr}")
    except subprocess.TimeoutExpired:
        cleanup_chrome(chrome_launch_process, chrome_pid)
        raise RuntimeError("Tab creation timed out after 60s")
    # Navigate to URL if requested (skipped for about:blank or when the
    # navigate hook is unavailable).
    if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
        try:
            result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            if result.returncode != 0:
                cleanup_chrome(chrome_launch_process, chrome_pid)
                raise RuntimeError(f"Navigation failed: {result.stderr}")
        except subprocess.TimeoutExpired:
            cleanup_chrome(chrome_launch_process, chrome_pid)
            raise RuntimeError("Navigation timed out after 120s")
    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
"""Clean up Chrome processes using chrome_utils.js killChrome.
@@ -835,8 +728,12 @@ def chrome_session(
):
"""Context manager for Chrome sessions with automatic cleanup.
Creates the directory structure, launches Chrome, creates a tab,
and optionally navigates to the test URL. Automatically cleans up
Chrome on exit.
Usage:
with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env):
# Run tests with chrome session
pass
# Chrome automatically cleaned up
@@ -850,20 +747,129 @@ def chrome_session(
timeout: Seconds to wait for Chrome to start
Yields:
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env)
Raises:
RuntimeError: If Chrome fails to start or tab creation fails
"""
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
tmpdir=tmpdir,
crawl_id=crawl_id,
snapshot_id=snapshot_id,
test_url=test_url,
navigate=navigate,
timeout=timeout,
# Create proper directory structure in tmpdir
machine = platform.machine().lower()
system = platform.system().lower()
if machine in ('arm64', 'aarch64'):
machine = 'arm64'
elif machine in ('x86_64', 'amd64'):
machine = 'x86_64'
machine_type = f"{machine}-{system}"
data_dir = Path(tmpdir) / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
node_modules_dir = npm_dir / 'node_modules'
# Create lib structure for puppeteer installation
node_modules_dir.mkdir(parents=True, exist_ok=True)
# Create crawl and snapshot directories
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir(exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(exist_ok=True)
# Build env with tmpdir-specific paths
env = os.environ.copy()
env.update({
'DATA_DIR': str(data_dir),
'LIB_DIR': str(lib_dir),
'MACHINE_TYPE': machine_type,
'NODE_MODULES_DIR': str(node_modules_dir),
'NODE_PATH': str(node_modules_dir),
'NPM_BIN_DIR': str(npm_dir / '.bin'),
'CHROME_HEADLESS': 'true',
})
# CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
# chrome_launch assumes chrome_install has already run
install_result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=120,
env=env
)
yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
if install_result.returncode != 0:
raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(timeout):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
if not (chrome_dir / 'cdp_url.txt').exists():
raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
# Create snapshot directory structure
snapshot_dir = Path(tmpdir) / 'snapshot'
snapshot_dir.mkdir(exist_ok=True)
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir(exist_ok=True)
# Create tab
tab_env = env.copy()
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
try:
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=tab_env
)
if result.returncode != 0:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError(f"Tab creation failed: {result.stderr}")
except subprocess.TimeoutExpired:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError("Tab creation timed out after 60s")
# Navigate to URL if requested
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
try:
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
if result.returncode != 0:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError(f"Navigation failed: {result.stderr}")
except subprocess.TimeoutExpired:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError("Navigation timed out after 120s")
yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env
finally:
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)

View File

@@ -525,10 +525,9 @@ def test_zombie_prevention_hook_killed():
time.sleep(1)
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
assert (chrome_dir / 'hook.pid').exists(), "Hook PID file should exist"
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
hook_pid = int((chrome_dir / 'hook.pid').read_text().strip())
hook_pid = chrome_launch_process.pid # Use the Popen process PID instead of hook.pid file
# Verify both Chrome and hook are running
try:

View File

@@ -0,0 +1,260 @@
"""
Tests for chrome_test_helpers.py functions.
These tests verify the Python helper functions used across Chrome plugin tests.
"""
import os
import pytest
import tempfile
from pathlib import Path
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_machine_type,
get_lib_dir,
get_node_modules_dir,
get_extensions_dir,
find_chromium_binary,
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
def test_get_machine_type():
    """Test get_machine_type() returns valid format."""
    machine_type = get_machine_type()
    assert isinstance(machine_type, str)
    assert '-' in machine_type, "Machine type should be in format: arch-os"
    # Must carry a known architecture and a known OS token.
    arch_ok = any(arch in machine_type for arch in ['arm64', 'x86_64'])
    os_ok = any(osname in machine_type for osname in ['darwin', 'linux', 'win32'])
    assert arch_ok, "Should contain valid architecture"
    assert os_ok, "Should contain valid OS"
def test_get_lib_dir_with_env_var():
    """Test get_lib_dir() respects LIB_DIR env var."""
    with tempfile.TemporaryDirectory() as tmpdir:
        custom_lib = Path(tmpdir) / 'custom_lib'
        custom_lib.mkdir()
        old_lib_dir = os.environ.get('LIB_DIR')
        try:
            os.environ['LIB_DIR'] = str(custom_lib)
            lib_dir = get_lib_dir()
            assert lib_dir == custom_lib
        finally:
            # FIX: compare against None (not truthiness) so a pre-existing
            # empty-string LIB_DIR is restored instead of being deleted.
            if old_lib_dir is not None:
                os.environ['LIB_DIR'] = old_lib_dir
            else:
                os.environ.pop('LIB_DIR', None)
def test_get_node_modules_dir_with_env_var():
    """Test get_node_modules_dir() respects NODE_MODULES_DIR env var."""
    with tempfile.TemporaryDirectory() as tmpdir:
        custom_nm = Path(tmpdir) / 'node_modules'
        custom_nm.mkdir()
        old_nm_dir = os.environ.get('NODE_MODULES_DIR')
        try:
            os.environ['NODE_MODULES_DIR'] = str(custom_nm)
            nm_dir = get_node_modules_dir()
            assert nm_dir == custom_nm
        finally:
            # FIX: compare against None (not truthiness) so a pre-existing
            # empty-string NODE_MODULES_DIR is restored, not deleted.
            if old_nm_dir is not None:
                os.environ['NODE_MODULES_DIR'] = old_nm_dir
            else:
                os.environ.pop('NODE_MODULES_DIR', None)
def test_get_extensions_dir_default():
    """Test get_extensions_dir() returns expected path format."""
    ext_dir = get_extensions_dir()
    assert isinstance(ext_dir, str)
    # Path must route through the personas chrome_extensions directory.
    for expected_part in ('personas', 'chrome_extensions'):
        assert expected_part in ext_dir
def test_get_extensions_dir_with_custom_persona():
    """Test get_extensions_dir() respects ACTIVE_PERSONA env var."""
    # Snapshot both variables so we can restore them exactly afterwards.
    saved = {key: os.environ.get(key) for key in ('ACTIVE_PERSONA', 'DATA_DIR')}
    try:
        os.environ['ACTIVE_PERSONA'] = 'TestPersona'
        os.environ['DATA_DIR'] = '/tmp/test'
        ext_dir = get_extensions_dir()
        assert 'TestPersona' in ext_dir
        assert '/tmp/test' in ext_dir
    finally:
        # FIX: restore on `is not None` (not truthiness) so pre-existing
        # empty-string values survive instead of being popped.
        for key, value in saved.items():
            if value is not None:
                os.environ[key] = value
            else:
                os.environ.pop(key, None)
def test_get_test_env_returns_dict():
    """Test get_test_env() returns properly formatted environment dict."""
    env = get_test_env()
    assert isinstance(env, dict)
    # Every key the Chrome hooks rely on must be present.
    required_keys = (
        'MACHINE_TYPE',
        'LIB_DIR',
        'NODE_MODULES_DIR',
        'NODE_PATH',  # Critical for module resolution
        'NPM_BIN_DIR',
        'CHROME_EXTENSIONS_DIR',
    )
    for key in required_keys:
        assert key in env
    # NODE_PATH must mirror NODE_MODULES_DIR for Node.js module resolution.
    assert env['NODE_PATH'] == env['NODE_MODULES_DIR']
def test_get_test_env_paths_are_absolute():
    """Every path-like value returned by get_test_env() must be absolute."""
    env = get_test_env()
    for path_key in ('LIB_DIR', 'NODE_MODULES_DIR', 'NODE_PATH'):
        assert Path(env[path_key]).is_absolute()
def test_find_chromium_binary():
    """find_chromium_binary() should yield an absolute path string, or a falsy value if absent."""
    binary = find_chromium_binary()
    if not binary:
        return  # Chromium may legitimately be missing in this environment.
    assert isinstance(binary, str)
    # A found binary must be reported as an absolute path.
    assert os.path.isabs(binary)
def test_get_plugin_dir():
    """get_plugin_dir() should map this test file to the chrome plugin directory."""
    plugin_dir = get_plugin_dir(__file__)
    assert plugin_dir.exists()
    assert plugin_dir.is_dir()
    # Expect a path shaped like <...>/plugins/chrome
    assert plugin_dir.name == 'chrome'
    assert plugin_dir.parent.name == 'plugins'
def test_get_hook_script_finds_existing_hook():
    """get_hook_script() should locate the chrome launch hook when it exists."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR

    hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
    if not hook:
        return  # Hook may be absent in some test environments.
    assert hook.exists()
    assert hook.is_file()
    assert 'chrome_launch' in hook.name
def test_get_hook_script_returns_none_for_missing():
    """get_hook_script() should return None when no hook matches the pattern."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR

    missing = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*')
    assert missing is None
def test_parse_jsonl_output_valid():
    """parse_jsonl_output() should return the first record from valid JSONL."""
    jsonl_output = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"}
{"type": "ArchiveResult", "status": "failed", "error": "test2"}
'''
    result = parse_jsonl_output(jsonl_output)
    assert result is not None
    # Only the first matching record is returned.
    expected = {'type': 'ArchiveResult', 'status': 'succeeded', 'output': 'test1'}
    for key, value in expected.items():
        assert result[key] == value
def test_parse_jsonl_output_with_non_json_lines():
    """parse_jsonl_output() should ignore lines that are not JSON."""
    mixed_output = '''Some non-JSON output
{"type": "ArchiveResult", "status": "succeeded"}
More non-JSON
{"type": "ArchiveResult", "status": "failed"}
'''
    result = parse_jsonl_output(mixed_output)
    assert result is not None
    # The non-JSON lines are skipped and the first record wins.
    assert result['type'] == 'ArchiveResult'
    assert result['status'] == 'succeeded'
def test_parse_jsonl_output_empty():
    """Empty input should produce no record."""
    assert parse_jsonl_output('') is None
def test_parse_jsonl_output_filters_by_type():
    """parse_jsonl_output() should skip records of other types when filtering."""
    jsonl_output = '''{"type": "LogEntry", "data": "log1"}
{"type": "ArchiveResult", "data": "result1"}
{"type": "ArchiveResult", "data": "result2"}
'''
    result = parse_jsonl_output(jsonl_output, record_type='ArchiveResult')
    assert result is not None
    # LogEntry is skipped; the first ArchiveResult is returned.
    assert result['type'] == 'ArchiveResult'
    assert result['data'] == 'result1'
def test_parse_jsonl_output_filters_custom_type():
    """parse_jsonl_output() should honor an arbitrary record_type filter."""
    jsonl_output = '''{"type": "ArchiveResult", "data": "result1"}
{"type": "LogEntry", "data": "log1"}
{"type": "ArchiveResult", "data": "result2"}
'''
    log_record = parse_jsonl_output(jsonl_output, record_type='LogEntry')
    assert log_record is not None
    assert log_record['type'] == 'LogEntry'
    assert log_record['data'] == 'log1'
def test_machine_type_consistency():
    """get_machine_type() must be deterministic across calls."""
    first = get_machine_type()
    second = get_machine_type()
    assert first == second, "Machine type should be stable across calls"
def test_lib_dir_is_directory():
    """Test that lib_dir points to an actual directory when DATA_DIR is set."""
    with tempfile.TemporaryDirectory() as tmpdir:
        old_data_dir = os.environ.get('DATA_DIR')
        try:
            os.environ['DATA_DIR'] = tmpdir
            # Create the expected directory structure
            machine_type = get_machine_type()
            lib_dir = Path(tmpdir) / 'lib' / machine_type
            lib_dir.mkdir(parents=True, exist_ok=True)
            result = get_lib_dir()
            # Should return a Path object
            assert isinstance(result, Path)
        finally:
            # `is not None` (not truthiness): an originally-empty DATA_DIR
            # must be restored rather than removed.
            if old_data_dir is not None:
                os.environ['DATA_DIR'] = old_data_dir
            else:
                os.environ.pop('DATA_DIR', None)
# Allow running this test module directly (delegates to pytest's runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -72,10 +72,9 @@ class TestConsolelogWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the consolelog hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run consolelog hook with the active Chrome session
result = subprocess.run(

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect forum-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if forum-dl is found
"""
import json
import os
import sys

from abx_pkg import Binary, EnvProvider


def get_env(name: str, default: str = '') -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default*."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean flag; fall back to *default*."""
    raw = get_env(name, '').lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default


def output_binary_found(binary: Binary, name: str):
    """Output Binary JSONL record for an installed binary."""
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # Already installed
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def output_binary_missing(name: str, binproviders: str):
    """Output Binary JSONL record for a missing binary that needs installation."""
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,  # Providers that can install it
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def main():
    """Look for forum-dl on the host and emit the matching Binary record."""
    binary_name = get_env('FORUMDL_BINARY', 'forum-dl')

    # Plugin disabled: emit nothing at all.
    if not get_env_bool('FORUMDL_ENABLED', True):
        sys.exit(0)

    try:
        binary = Binary(name=binary_name, binproviders=[EnvProvider()]).load()
        if binary.abspath:
            output_binary_found(binary, name='forum-dl')
        else:
            output_binary_missing(name='forum-dl', binproviders='pip')
    except Exception:
        # Detection failed entirely -> report as missing, installable via pip.
        output_binary_missing(name='forum-dl', binproviders='pip')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -24,8 +24,7 @@ import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
chrome_session,
)
@@ -101,22 +100,17 @@ def test_fails_gracefully_without_chrome_session():
def test_scrolls_page_and_outputs_stats():
"""Integration test: scroll page and verify JSONL output format."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-infiniscroll',
snapshot_id='snap-infiniscroll',
test_url=TEST_URL,
)
with chrome_session(
Path(tmpdir),
crawl_id='test-infiniscroll',
snapshot_id='snap-infiniscroll',
test_url=TEST_URL,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
# Create infiniscroll output directory (sibling to chrome)
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
# Run infiniscroll hook
env = get_test_env()
env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test
env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling
env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test
@@ -158,29 +152,21 @@ def test_scrolls_page_and_outputs_stats():
output_files = list(infiniscroll_dir.iterdir())
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
finally:
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_scroll_limit_honored():
"""Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-scroll-limit',
snapshot_id='snap-limit',
test_url=TEST_URL,
)
with chrome_session(
Path(tmpdir),
crawl_id='test-scroll-limit',
snapshot_id='snap-limit',
test_url=TEST_URL,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
# Set scroll limit to 2
env = get_test_env()
# Set scroll limit to 2 (use env from setup_chrome_session)
env['INFINISCROLL_SCROLL_LIMIT'] = '2'
env['INFINISCROLL_SCROLL_DELAY'] = '500'
env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in
@@ -215,29 +201,22 @@ def test_config_scroll_limit_honored():
assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
finally:
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_timeout_honored():
"""Test that INFINISCROLL_TIMEOUT config is respected."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-timeout',
snapshot_id='snap-timeout',
test_url=TEST_URL,
)
with chrome_session(
Path(tmpdir),
crawl_id='test-timeout',
snapshot_id='snap-timeout',
test_url=TEST_URL,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
# Set very short timeout
env = get_test_env()
# Set very short timeout (use env from setup_chrome_session)
env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds
env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger
env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit
@@ -258,9 +237,6 @@ def test_config_timeout_honored():
assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
finally:
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
if __name__ == '__main__':

View File

@@ -154,8 +154,7 @@ def test_extension_loads_in_chromium():
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir,
env=get_test_env()),
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
Detect mercury-parser binary and emit Binary JSONL record.
Detect postlight-parser binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if mercury-parser is found
Output: Binary JSONL record to stdout if postlight-parser is found
"""
import json
@@ -48,6 +48,11 @@ def output_binary_missing(name: str, binproviders: str):
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'overrides': {
'npm': {
'packages': ['@postlight/parser'],
}
},
'machine_id': machine_id,
}
print(json.dumps(record))
@@ -55,7 +60,7 @@ def output_binary_missing(name: str, binproviders: str):
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
if not mercury_enabled:
sys.exit(0)
@@ -65,13 +70,13 @@ def main():
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='mercury-parser')
output_binary_found(binary, name='postlight-parser')
else:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
output_binary_missing(name='postlight-parser', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
output_binary_missing(name='postlight-parser', binproviders='npm')
sys.exit(0)

View File

@@ -25,8 +25,7 @@ import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
chrome_session,
)
@@ -103,129 +102,119 @@ def test_fails_gracefully_without_chrome_session():
def test_background_script_handles_sigterm():
"""Test that background script runs and handles SIGTERM correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
with chrome_session(
Path(tmpdir),
crawl_id='test-modalcloser',
snapshot_id='snap-modalcloser',
test_url=TEST_URL,
)
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
# Create modalcloser output directory (sibling to chrome)
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
# Create modalcloser output directory (sibling to chrome)
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
# Run modalcloser as background process (use env from setup_chrome_session)
env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
# Run modalcloser as background process
env = get_test_env()
env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Let it run for a bit
time.sleep(2)
# Let it run for a bit
time.sleep(2)
# Verify it's still running (background script)
assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
# Verify it's still running (background script)
assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process"
# Send SIGTERM
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
# Send SIGTERM
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}"
# Parse JSONL output
result_json = None
for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Parse JSONL output
result_json = None
for line in stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output_str format
output_str = result_json.get('output_str', '')
assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
f"output_str should mention modals/dialogs: {output_str}"
# Verify output_str format
output_str = result_json.get('output_str', '')
assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \
f"output_str should mention modals/dialogs: {output_str}"
# Verify no files created in output directory
output_files = list(modalcloser_dir.iterdir())
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
# Verify no files created in output directory
output_files = list(modalcloser_dir.iterdir())
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_dialog_handler_logs_dialogs():
"""Test that dialog handler is set up correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-dialog',
snapshot_id='snap-dialog',
test_url=TEST_URL,
)
with chrome_session(
Path(tmpdir),
crawl_id='test-dialog',
snapshot_id='snap-dialog',
test_url=TEST_URL,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
env = get_test_env()
env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
env['MODALCLOSER_POLL_INTERVAL'] = '200'
# Use env from setup_chrome_session
env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test
env['MODALCLOSER_POLL_INTERVAL'] = '200'
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-dialog'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Let it run briefly
time.sleep(1.5)
# Let it run briefly
time.sleep(1.5)
# Verify it's running
assert modalcloser_process.poll() is None, "Should be running"
# Verify it's running
assert modalcloser_process.poll() is None, "Should be running"
# Check stderr for "listening" message
# Note: Can't read stderr while process is running without blocking,
# so we just verify it exits cleanly
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
# Check stderr for "listening" message
# Note: Can't read stderr while process is running without blocking,
# so we just verify it exits cleanly
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
f"Should log startup message: {stderr}"
assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \
f"Should log startup message: {stderr}"
assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_poll_interval():
@@ -235,61 +224,58 @@ def test_config_poll_interval():
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-poll',
snapshot_id='snap-poll',
test_url=TEST_URL,
)
with chrome_session(
Path(tmpdir),
crawl_id='test-poll',
snapshot_id='snap-poll',
test_url=TEST_URL,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
# Set very short poll interval
env = get_test_env()
env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
# Set very short poll interval (use env from setup_chrome_session)
env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
modalcloser_process = subprocess.Popen(
['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'],
cwd=str(modalcloser_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Run for short time
time.sleep(1)
# Run for short time
time.sleep(1)
# Should still be running
assert modalcloser_process.poll() is None, "Should still be running"
# Should still be running
assert modalcloser_process.poll() is None, "Should still be running"
# Clean exit
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
# Clean exit
modalcloser_process.send_signal(signal.SIGTERM)
stdout, stderr = modalcloser_process.communicate(timeout=5)
assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}"
# Verify JSONL output exists
result_json = None
for line in stdout.strip().split('\n'):
if line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Verify JSONL output exists
result_json = None
for line in stdout.strip().split('\n'):
if line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json is not None, "Should have JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert result_json is not None, "Should have JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
finally:
if modalcloser_process and modalcloser_process.poll() is None:
modalcloser_process.kill()
if chrome_launch_process and chrome_pid:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_hides_cookie_consent_on_filmin():

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect papers-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if papers-dl is found
"""
import json
import os
import sys

from abx_pkg import Binary, EnvProvider


def get_env(name: str, default: str = '') -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default*."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean flag; fall back to *default*."""
    raw = get_env(name, '').lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default


def output_binary_found(binary: Binary, name: str):
    """Output Binary JSONL record for an installed binary."""
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # Already installed
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def output_binary_missing(name: str, binproviders: str):
    """Output Binary JSONL record for a missing binary that needs installation."""
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,  # Providers that can install it
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def main():
    """Look for papers-dl on the host and emit the matching Binary record."""
    binary_name = get_env('PAPERSDL_BINARY', 'papers-dl')

    # Plugin disabled: emit nothing at all.
    if not get_env_bool('PAPERSDL_ENABLED', True):
        sys.exit(0)

    try:
        binary = Binary(name=binary_name, binproviders=[EnvProvider()]).load()
        if binary.abspath:
            output_binary_found(binary, name='papers-dl')
        else:
            output_binary_missing(name='papers-dl', binproviders='pip')
    except Exception:
        # Detection failed entirely -> report as missing, installable via pip.
        output_binary_missing(name='papers-dl', binproviders='pip')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -72,10 +72,9 @@ class TestParseDomOutlinksWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the outlinks hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run outlinks hook with the active Chrome session
result = subprocess.run(

View File

@@ -73,10 +73,9 @@ class TestRedirectsWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the redirects hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run redirects hook with the active Chrome session
result = subprocess.run(

View File

@@ -72,10 +72,9 @@ class TestResponsesWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the responses hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run responses hook with the active Chrome session
result = subprocess.run(

View File

@@ -1,20 +1,15 @@
#!/usr/bin/env node
/**
* Take a screenshot of a URL using Chrome/Puppeteer.
* Take a screenshot of a URL using an existing Chrome session.
*
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
* Requires chrome plugin to have already created a Chrome session.
* Connects to the existing session via CDP and takes a screenshot.
*
* Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
@@ -24,10 +19,8 @@ const path = require('path');
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
parseArgs,
readCdpUrl,
@@ -56,7 +49,7 @@ function hasStaticFileOutput() {
}
// Wait for chrome tab to be fully loaded
async function waitForChromeTabLoaded(timeoutMs = 60000) {
async function waitForChromeTabLoaded(timeoutMs = 10000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
@@ -72,102 +65,66 @@ async function waitForChromeTabLoaded(timeoutMs = 60000) {
}
async function takeScreenshot(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
const headless = getEnvBool('CHROME_HEADLESS', true);
const resolution = getEnv('CHROME_RESOLUTION', '1440,2000');
const { width, height } = parseResolution(resolution);
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
let page = null;
let connectedToSession = false;
// Wait for chrome_navigate to complete (writes navigation.json)
const timeoutSeconds = parseInt(getEnv('SCREENSHOT_TIMEOUT', '10'), 10);
const timeoutMs = timeoutSeconds * 1000;
const pageLoaded = await waitForChromeTabLoaded(timeoutMs);
if (!pageLoaded) {
throw new Error(`Page not loaded after ${timeoutSeconds}s (chrome_navigate must complete first)`);
}
// Connect to existing Chrome session (required - no fallback)
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (!cdpUrl) {
throw new Error('No Chrome session found (chrome plugin must run first)');
}
// Read target_id.txt to get the specific tab for this snapshot
const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
if (!fs.existsSync(targetIdFile)) {
throw new Error('No target_id.txt found (chrome_tab must run first)');
}
const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
});
try {
// Try to connect to existing Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
try {
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
});
connectedToSession = true;
// Get existing pages or create new one
const pages = await browser.pages();
page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
page = await browser.newPage();
}
// Set viewport on the page
await page.setViewport({ width, height });
} catch (e) {
console.error(`Failed to connect to CDP session: ${e.message}`);
browser = null;
}
// Get the specific page for this snapshot by target ID
const targets = await browser.targets();
const target = targets.find(t => t._targetId === targetId);
if (!target) {
throw new Error(`Target ${targetId} not found in Chrome session`);
}
// Fall back to launching new browser
if (!browser) {
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}
browser = await puppeteer.launch({
executablePath,
headless: headless ? 'new' : false,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
`--window-size=${width},${height}`,
...(checkSsl ? [] : ['--ignore-certificate-errors']),
],
defaultViewport: { width, height },
});
page = await browser.newPage();
// Navigate to URL (only if we launched fresh browser)
if (userAgent) {
await page.setUserAgent(userAgent);
}
await page.goto(url, {
waitUntil: 'networkidle2',
timeout,
});
const page = await target.page();
if (!page) {
throw new Error(`Could not get page for target ${targetId}`);
}
// Take screenshot
// Set viewport on the page
await page.setViewport({ width, height });
// Take screenshot (Puppeteer throws on failure)
await page.screenshot({
path: outputPath,
fullPage: true,
});
if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
return { success: true, output: outputPath };
} else {
return { success: false, error: 'Screenshot file not created' };
}
return outputPath;
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
// Only close browser if we launched it (not if we connected to session)
if (browser && !connectedToSession) {
await browser.close();
}
// Disconnect from browser (don't close it - we're connected to a shared session)
// The chrome_launch hook manages the browser lifecycle
await browser.disconnect();
}
}
@@ -181,54 +138,33 @@ async function main() {
process.exit(1);
}
try {
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0);
}
// Only wait for page load if using shared Chrome session
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await takeScreenshot(url);
if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`Screenshot saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0);
}
// Take screenshot (throws on error)
const outputPath = await takeScreenshot(url);
// Success - emit ArchiveResult
const size = fs.statSync(outputPath).size;
console.error(`Screenshot saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: outputPath,
}));
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.message}`);
process.exit(1);
});

View File

@@ -25,6 +25,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
chrome_session,
LIB_DIR,
NODE_MODULES_DIR,
CHROME_PLUGIN_DIR,
@@ -62,192 +63,96 @@ def test_verify_deps_with_abx_pkg():
assert node_loaded and node_loaded.abspath, "Node.js required for screenshot plugin"
def test_extracts_screenshot_from_example_com():
"""Test full workflow: extract screenshot from real example.com via hook.
Replicates production directory structure:
DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/
This exercises the "connect to existing session" code path which is the primary
path in production and accounts for ~50% of the code.
"""
import signal
import time
import os
def test_screenshot_with_chrome_session():
"""Test multiple screenshot scenarios with one Chrome session to save time."""
with tempfile.TemporaryDirectory() as tmpdir:
# Replicate exact production directory structure
data_dir = Path(tmpdir)
crawl_id = 'test-screenshot-crawl'
test_url = 'https://example.com'
snapshot_id = 'test-screenshot-snap'
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True)
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir(parents=True)
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
# Step 1: Launch Chrome session at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(15):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
try:
# Step 2: Create tab at snapshot level
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=env
)
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
with chrome_session(
Path(tmpdir),
crawl_id='test-screenshot-crawl',
snapshot_id=snapshot_id,
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Step 3: Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Navigation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
# Scenario 1: Basic screenshot extraction
screenshot_dir = snapshot_chrome_dir.parent / 'screenshot'
screenshot_dir.mkdir()
# Step 4: Take screenshot (should connect to existing session)
# Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
assert result_json and result_json['status'] == 'succeeded'
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists() and screenshot_file.stat().st_size > 1000
assert screenshot_file.read_bytes()[:8] == b'\x89PNG\r\n\x1a\n'
# Verify filesystem output
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
# Scenario 2: Custom resolution
screenshot_dir2 = snapshot_chrome_dir.parent / 'screenshot2'
screenshot_dir2.mkdir()
env['CHROME_RESOLUTION'] = '800,600'
# Verify file is valid PNG
file_size = screenshot_file.stat().st_size
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir2),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Check PNG magic bytes
screenshot_data = screenshot_file.read_bytes()
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
assert result.returncode == 0
screenshot_file2 = screenshot_dir2 / 'screenshot.png'
assert screenshot_file2.exists()
file_size = screenshot_file2.stat().st_size
assert 500 < file_size < 100000, f"800x600 screenshot size unexpected: {file_size}"
finally:
# Cleanup: Kill Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
# Scenario 3: Wrong target ID (error case)
screenshot_dir3 = snapshot_chrome_dir.parent / 'screenshot3'
screenshot_dir3.mkdir()
(snapshot_chrome_dir / 'target_id.txt').write_text('nonexistent-target-id')
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir3),
capture_output=True,
text=True,
timeout=5,
env=env
)
def test_extracts_screenshot_without_session():
"""Test screenshot extraction without existing Chrome session (fallback to own browser)."""
with tempfile.TemporaryDirectory() as tmpdir:
# Create proper snapshot directory structure
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
assert result.returncode != 0
assert 'target' in result.stderr.lower() and 'not found' in result.stderr.lower()
# Don't set up Chrome session or staticfile - screenshot should launch its own browser
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str']
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
except RuntimeError as e:
if 'Chrome' in str(e) or 'CDP' in str(e):
pytest.skip(f"Chrome session setup failed: {e}")
raise
def test_skips_when_staticfile_exists():
@@ -344,57 +249,42 @@ def test_reports_missing_chrome():
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
def test_custom_resolution_and_user_agent():
"""Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected."""
def test_waits_for_navigation_timeout():
"""Test that screenshot waits for navigation.json and times out quickly if missing."""
import time
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
tmpdir = Path(tmpdir)
# Create chrome directory without navigation.json to trigger timeout
chrome_dir = tmpdir.parent / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
(chrome_dir / 'target_id.txt').write_text('test-target-id')
# Intentionally NOT creating navigation.json to test timeout
screenshot_dir = tmpdir / 'screenshot'
screenshot_dir.mkdir()
env = get_test_env()
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'Test/1.0'
env['SCREENSHOT_TIMEOUT'] = '2' # Set 2 second timeout
start_time = time.time()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-timeout'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
timeout=5, # Test timeout slightly higher than SCREENSHOT_TIMEOUT
env=env
)
elapsed = time.time() - start_time
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
# Resolution affects file size
assert screenshot_file.stat().st_size > 500, "Screenshot too small"
def test_ssl_check_disabled():
"""Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'False'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should succeed: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
# Should fail when navigation.json doesn't appear
assert result.returncode != 0, "Should fail when navigation.json missing"
assert 'not loaded' in result.stderr.lower() or 'navigate' in result.stderr.lower(), f"Should mention navigation timeout: {result.stderr}"
# Should complete within 3s (2s wait + 1s overhead)
assert elapsed < 3, f"Should timeout within 3s, took {elapsed:.1f}s"
def test_config_timeout_honored():
@@ -485,345 +375,114 @@ def test_invalid_resolution_format():
# (depending on implementation - script should not crash with uncaught error)
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
def test_boolean_env_var_parsing():
"""Test that boolean environment variables are parsed correctly."""
import time
def test_no_cdp_url_fails():
"""Test error when chrome dir exists but no cdp_url.txt."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test various boolean formats for CHROME_HEADLESS
for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']:
env['CHROME_HEADLESS'] = bool_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail, but shouldn't crash on boolean parsing
assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
time.sleep(0.5) # Brief pause between attempts
def test_integer_env_var_parsing():
"""Test that integer environment variables are parsed correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test valid and invalid integer formats for CHROME_TIMEOUT
test_cases = [
('60', True), # Valid integer
('invalid', True), # Invalid - should use default
('', True), # Empty - should use default
]
for timeout_val, should_work in test_cases:
env['CHROME_TIMEOUT'] = timeout_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail gracefully, but shouldn't crash on int parsing
assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
def test_extracts_screenshot_with_all_config_options():
"""Test screenshot with comprehensive config to exercise all code paths."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Set ALL config options to exercise all code paths
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'TestBot/1.0'
env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch
env['CHROME_TIMEOUT'] = '60'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"
# Verify JSONL output with success
result_json = None
for line in result.stdout.strip().split('\n'):
if line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str']
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png should be created"
assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
def test_headless_mode_false():
"""Test headless=false code path specifically."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Explicitly test headless=false (exercises the ternary false branch)
env['CHROME_HEADLESS'] = 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should work or fail gracefully
assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
def test_invalid_url_causes_error():
"""Test error path with invalid URL that causes navigation failure."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_TIMEOUT'] = '5' # Short timeout
# Use invalid URL to trigger error path
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should fail due to navigation error
assert result.returncode != 0, "Should fail on invalid URL"
# Should NOT emit JSONL (transient error)
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
def test_with_corrupted_cdp_url_falls_back():
"""Test that corrupted CDP URL file causes fallback to launching browser."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Create chrome directory with corrupted CDP URL
chrome_dir = snapshot_dir / 'chrome'
tmpdir = Path(tmpdir)
chrome_dir = tmpdir / 'chrome'
chrome_dir.mkdir()
(chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')
# Create target_id.txt and navigation.json but NOT cdp_url.txt
(chrome_dir / 'target_id.txt').write_text('test-target')
(chrome_dir / 'navigation.json').write_text('{}')
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test
screenshot_dir = tmpdir / 'screenshot'
screenshot_dir.mkdir()
# Screenshot should try CDP, fail quickly, then fall back to launching own browser
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
timeout=7,
env=get_test_env()
)
# Should succeed by falling back to launching browser
assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
# Verify screenshot was created via fallback path
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created via fallback"
assert result.returncode != 0
assert 'no chrome session' in result.stderr.lower()
def test_user_agent_is_applied():
"""Test that CHROME_USER_AGENT is actually applied when launching browser."""
def test_no_target_id_fails():
"""Test error when cdp_url exists but no target_id.txt."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
tmpdir = Path(tmpdir)
chrome_dir = tmpdir / 'chrome'
chrome_dir.mkdir()
# Create cdp_url.txt and navigation.json but NOT target_id.txt
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/devtools/browser/test')
(chrome_dir / 'navigation.json').write_text('{}')
env = get_test_env()
env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)'
env['CHROME_HEADLESS'] = 'true'
screenshot_dir = tmpdir / 'screenshot'
screenshot_dir.mkdir()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
timeout=7,
env=get_test_env()
)
# Should succeed with custom user agent
assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created"
assert result.returncode != 0
assert 'target_id.txt' in result.stderr.lower()
def test_check_ssl_false_branch():
"""Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
def test_invalid_cdp_url_fails():
"""Test error with malformed CDP URL."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
tmpdir = Path(tmpdir)
chrome_dir = tmpdir / 'chrome'
chrome_dir.mkdir()
(chrome_dir / 'cdp_url.txt').write_text('invalid-url')
(chrome_dir / 'target_id.txt').write_text('test-target')
(chrome_dir / 'navigation.json').write_text('{}')
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
env['CHROME_HEADLESS'] = 'true'
screenshot_dir = tmpdir / 'screenshot'
screenshot_dir.mkdir()
# Test with both boolean false and string 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
timeout=7,
env=get_test_env()
)
assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
assert result.returncode != 0
def test_alternative_env_var_names():
"""Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc)."""
def test_invalid_timeout_uses_default():
"""Test that invalid SCREENSHOT_TIMEOUT falls back to default."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
tmpdir = Path(tmpdir)
chrome_dir = tmpdir / 'chrome'
chrome_dir.mkdir()
# No navigation.json to trigger timeout
(chrome_dir / 'cdp_url.txt').write_text('ws://localhost:9222/test')
(chrome_dir / 'target_id.txt').write_text('test')
screenshot_dir = tmpdir / 'screenshot'
screenshot_dir.mkdir()
env = get_test_env()
# Use alternative env var names (without CHROME_ prefix)
env['TIMEOUT'] = '45'
env['RESOLUTION'] = '1024,768'
env['USER_AGENT'] = 'AltBot/1.0'
env['CHECK_SSL_VALIDITY'] = 'false'
env['SCREENSHOT_TIMEOUT'] = 'invalid' # Should fallback to default (10s becomes NaN, treated as 0)
import time
start = time.time()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=test'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
timeout=5,
env=env
)
elapsed = time.time() - start
assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_very_large_resolution():
"""Test screenshot with very large resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists()
# 4K screenshot should be larger
assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
def test_very_small_resolution():
"""Test screenshot with very small resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '320,240' # Very small
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
# With invalid timeout, parseInt returns NaN, which should be handled
assert result.returncode != 0
assert elapsed < 2 # Should fail quickly, not wait 10s
if __name__ == '__main__':

View File

@@ -72,10 +72,9 @@ class TestSEOWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the SEO hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run SEO hook with the active Chrome session
result = subprocess.run(

View File

@@ -22,7 +22,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
setup_chrome_session,
chrome_session,
cleanup_chrome,
)
@@ -96,17 +96,15 @@ def test_singlefile_with_chrome_session():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
try:
# Set up Chrome session using shared helper
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
tmpdir=tmpdir,
crawl_id='singlefile-test-crawl',
snapshot_id='singlefile-test-snap',
test_url=TEST_URL,
navigate=False, # Don't navigate, singlefile will do that
timeout=20,
)
# Set up Chrome session using shared helper
with chrome_session(
tmpdir=tmpdir,
crawl_id='singlefile-test-crawl',
snapshot_id='singlefile-test-snap',
test_url=TEST_URL,
navigate=False, # Don't navigate, singlefile will do that
timeout=20,
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
# So we need to run from a directory that has ../chrome pointing to our chrome dir
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
@@ -117,9 +115,8 @@ def test_singlefile_with_chrome_session():
if not chrome_link.exists():
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
env = get_test_env()
# Use env from chrome_session
env['SINGLEFILE_ENABLED'] = 'true'
env['CHROME_HEADLESS'] = 'true'
# Run singlefile - it should find and use the existing Chrome session
result = subprocess.run(
@@ -143,9 +140,6 @@ def test_singlefile_with_chrome_session():
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
finally:
cleanup_chrome(chrome_launch_process, chrome_pid)
def test_singlefile_disabled_skips():
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""

View File

@@ -72,10 +72,9 @@ class TestSSLWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the SSL hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run SSL hook with the active Chrome session
result = subprocess.run(

View File

@@ -72,16 +72,14 @@ class TestStaticfileWithChrome(TestCase):
test_url=test_url,
navigate=True,
timeout=30,
) as (chrome_process, chrome_pid, snapshot_chrome_dir):
# Get environment and run the staticfile hook
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run staticfile hook with the active Chrome session
result = subprocess.run(
['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -384,11 +384,11 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
"""Test that ArchiveResultWorker.get_queue() only blocks on foreground extractors."""
"""Test that background hooks don't block foreground extractors from running."""
os.chdir(tmp_path)
# This test verifies the fix for the orchestrator bug where background hooks
# were blocking parser extractors from running
# This test verifies that background hooks run concurrently with foreground hooks
# and don't block parser extractors
# Start a crawl
env = os.environ.copy()

View File

@@ -1,15 +1,13 @@
"""
Orchestrator for managing worker processes.
The Orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed.
Architecture:
Orchestrator (main loop, polls queues)
── CrawlWorker subprocess(es)
── SnapshotWorker subprocess(es)
└── ArchiveResultWorker subprocess(es)
└── Each worker spawns task subprocesses via CLI
Orchestrator (polls Crawl queue)
── CrawlWorker(s) (one per active Crawl)
── SnapshotWorker(s) (one per Snapshot, up to limit)
└── Hook Processes (sequential, forked by SnapshotWorker)
Usage:
# Default: runs forever (for use as subprocess of server)
@@ -38,7 +36,7 @@ from django.utils import timezone
from rich import print
from archivebox.misc.logging_util import log_worker_event
from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
from .worker import Worker, CrawlWorker
def _run_orchestrator_process(exit_on_idle: bool) -> None:
@@ -52,22 +50,27 @@ def _run_orchestrator_process(exit_on_idle: bool) -> None:
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
The orchestrator:
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
2. If items exist and fewer than MAX_CONCURRENT workers are running, spawns workers
1. Polls Crawl queue
2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers
3. Monitors worker health and cleans up stale PIDs
4. Exits when all queues are empty (unless daemon mode)
4. Exits when queue is empty (unless daemon mode)
Architecture:
- Orchestrator spawns CrawlWorkers (one per active Crawl)
- Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit)
- Each SnapshotWorker runs hooks sequentially for its snapshot
"""
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker]
# Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
# Configuration
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type
MAX_TOTAL_WORKERS: int = 24 # Max workers across all types
MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
self.exit_on_idle = exit_on_idle
self.crawl_id = crawl_id # If set, only process work for this crawl
@@ -76,11 +79,9 @@ class Orchestrator:
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
# In foreground mode (exit_on_idle=True), limit workers but allow enough
# for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
# In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
if self.exit_on_idle:
self.MAX_WORKERS_PER_TYPE = 1
self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
self.MAX_CRAWL_WORKERS = 1
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -109,14 +110,18 @@ class Orchestrator:
# Clean up any stale Process records from previous runs
stale_count = Process.cleanup_stale_running()
# Clean up orphaned Chrome processes from previous crashes
chrome_count = Process.cleanup_orphaned_chrome()
# Collect startup metadata
metadata = {
'max_workers_per_type': self.MAX_WORKERS_PER_TYPE,
'max_total_workers': self.MAX_TOTAL_WORKERS,
'max_crawl_workers': self.MAX_CRAWL_WORKERS,
'poll_interval': self.POLL_INTERVAL,
}
if stale_count:
metadata['cleaned_stale_pids'] = stale_count
if chrome_count:
metadata['cleaned_orphaned_chrome'] = chrome_count
log_worker_event(
worker_type='Orchestrator',
@@ -126,8 +131,34 @@ class Orchestrator:
metadata=metadata,
)
def terminate_all_workers(self) -> None:
"""Terminate all running worker processes."""
from archivebox.machine.models import Process
import signal
# Get all running worker processes
running_workers = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
status__in=['running', 'started']
)
for worker_process in running_workers:
try:
# Send SIGTERM to gracefully terminate the worker
os.kill(worker_process.pid, signal.SIGTERM)
except ProcessLookupError:
# Process already dead
pass
except Exception:
# Ignore other errors during shutdown
pass
def on_shutdown(self, error: BaseException | None = None) -> None:
"""Called when orchestrator shuts down."""
# Terminate all worker processes in exit_on_idle mode
if self.exit_on_idle:
self.terminate_all_workers()
# Update Process record status
if hasattr(self, 'db_process') and self.db_process:
# KeyboardInterrupt is a graceful shutdown, not an error
@@ -163,20 +194,15 @@ class Orchestrator:
return len(WorkerClass.get_running_workers())
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
"""Determine if we should spawn a new worker of the given type."""
"""Determine if we should spawn a new CrawlWorker."""
if queue_count == 0:
return False
# Check per-type limit
# Check CrawlWorker limit
running_workers = WorkerClass.get_running_workers()
running_count = len(running_workers)
if running_count >= self.MAX_WORKERS_PER_TYPE:
return False
# Check total limit
total_workers = self.get_total_worker_count()
if total_workers >= self.MAX_TOTAL_WORKERS:
if running_count >= self.MAX_CRAWL_WORKERS:
return False
# Check if we already have enough workers for the queue size
@@ -190,7 +216,7 @@ class Orchestrator:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
pid = WorkerClass.start(crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
@@ -259,24 +285,49 @@ class Orchestrator:
def check_queues_and_spawn_workers(self) -> dict[str, int]:
"""
Check all queues and spawn workers as needed.
Returns dict of queue sizes by worker type.
Check Crawl queue and spawn CrawlWorkers as needed.
Returns dict of queue sizes.
"""
from archivebox.crawls.models import Crawl
queue_sizes = {}
for WorkerClass in self.WORKER_TYPES:
# Get queue for this worker type
# Need to instantiate worker to get queue (for model access)
worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id) # temp instance just for queue access
queue = worker.get_queue()
queue_count = queue.count()
queue_sizes[WorkerClass.name] = queue_count
# Only check Crawl queue
crawl_queue = Crawl.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=Crawl.FINAL_STATES
)
# Apply crawl_id filter if set
if self.crawl_id:
crawl_queue = crawl_queue.filter(id=self.crawl_id)
crawl_queue = crawl_queue.order_by('retry_at')
crawl_count = crawl_queue.count()
queue_sizes['crawl'] = crawl_count
# Spawn CrawlWorker if needed
if self.should_spawn_worker(CrawlWorker, crawl_count):
# Claim next crawl
crawl = crawl_queue.first()
if crawl and self._claim_crawl(crawl):
CrawlWorker.start(crawl_id=str(crawl.id))
# Spawn worker if needed
if self.should_spawn_worker(WorkerClass, queue_count):
self.spawn_worker(WorkerClass)
return queue_sizes
def _claim_crawl(self, crawl) -> bool:
"""Atomically claim a crawl using optimistic locking."""
from archivebox.crawls.models import Crawl
updated = Crawl.objects.filter(
pk=crawl.pk,
retry_at=crawl.retry_at,
).update(
retry_at=timezone.now() + timedelta(hours=24), # Long lock (crawls take time)
)
return updated == 1
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
"""Check if any queue has pending work."""
@@ -287,30 +338,21 @@ class Orchestrator:
return self.get_total_worker_count() > 0
def has_future_work(self) -> bool:
"""Check if there's work scheduled for the future (retry_at > now)."""
for WorkerClass in self.WORKER_TYPES:
worker = WorkerClass(worker_id=-1, crawl_id=self.crawl_id)
Model = worker.get_model()
"""Check if there's work scheduled for the future (retry_at > now) in Crawl queue."""
from archivebox.crawls.models import Crawl
# Build filter for future work, respecting crawl_id if set
qs = Model.objects.filter(
retry_at__gt=timezone.now()
).exclude(
status__in=Model.FINAL_STATES
)
# Build filter for future work, respecting crawl_id if set
qs = Crawl.objects.filter(
retry_at__gt=timezone.now()
).exclude(
status__in=Crawl.FINAL_STATES
)
# Apply crawl_id filter if set
if self.crawl_id:
if WorkerClass.name == 'crawl':
qs = qs.filter(id=self.crawl_id)
elif WorkerClass.name == 'snapshot':
qs = qs.filter(crawl_id=self.crawl_id)
elif WorkerClass.name == 'archiveresult':
qs = qs.filter(snapshot__crawl_id=self.crawl_id)
# Apply crawl_id filter if set
if self.crawl_id:
qs = qs.filter(id=self.crawl_id)
if qs.count() > 0:
return True
return False
return qs.count() > 0
def on_tick(self, queue_sizes: dict[str, int]) -> None:
"""Called each orchestrator tick. Override for custom behavior."""
@@ -345,20 +387,20 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
from archivebox.misc.logging import IS_TTY, CONSOLE
from rich.live import Live
from archivebox.misc.logging import IS_TTY
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
import sys
import os
# Enable progress bars only in TTY + foreground mode
# Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
self.on_startup()
task_ids = {}
if not show_progress:
# No progress bars - just run normally
self._run_orchestrator_loop(None, task_ids)
# No progress layout - just run normally
self._run_orchestrator_loop(None)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -384,14 +426,16 @@ class Orchestrator:
original_console = logging_module.CONSOLE
logging_module.CONSOLE = orchestrator_console
# Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
with Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
# Create layout and run with Live display
progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id)
with Live(
progress_layout.get_layout(),
refresh_per_second=4,
screen=True,
console=orchestrator_console,
) as progress:
self._run_orchestrator_loop(progress, task_ids)
):
self._run_orchestrator_loop(progress_layout)
# Restore original console
logging_module.CONSOLE = original_console
@@ -409,22 +453,68 @@ class Orchestrator:
pass
# stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress, task_ids):
def _run_orchestrator_loop(self, progress_layout):
"""Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None
tick_count = 0
# Track snapshot progress to detect changes
snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
try:
while True:
tick_count += 1
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
# Debug queue sizes (only when changed)
if progress and queue_sizes != last_queue_sizes:
progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
last_queue_sizes = queue_sizes.copy()
# Get worker counts for each type
worker_counts = {
WorkerClass.name: len(WorkerClass.get_running_workers())
for WorkerClass in self.WORKER_TYPES
}
# Update progress bars
if progress:
# Update layout if enabled
if progress_layout:
# Get crawl queue and worker counts
crawl_queue_count = queue_sizes.get('crawl', 0)
crawl_workers_count = worker_counts.get('crawl', 0)
# Determine orchestrator status
if crawl_workers_count > 0:
status = "Working"
elif crawl_queue_count > 0:
status = "Spawning"
else:
status = "Idle"
# Update orchestrator status
progress_layout.update_orchestrator_status(
status=status,
crawl_queue_count=crawl_queue_count,
crawl_workers_count=crawl_workers_count,
max_crawl_workers=self.MAX_CRAWL_WORKERS,
)
# Log queue size changes
if queue_sizes != last_queue_sizes:
for worker_type, count in queue_sizes.items():
old_count = last_queue_sizes.get(worker_type, 0)
if count != old_count:
if count > old_count:
progress_layout.log_event(
f"{worker_type.capitalize()} queue: {old_count}{count}",
style="yellow"
)
else:
progress_layout.log_event(
f"{worker_type.capitalize()} queue: {old_count}{count}",
style="green"
)
last_queue_sizes = queue_sizes.copy()
# Update snapshot progress
from archivebox.core.models import Snapshot
# Get all started snapshots (optionally filtered by crawl_id)
@@ -438,9 +528,36 @@ class Orchestrator:
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Debug snapshot count (only when changed)
# Log snapshot count changes and details
if len(active_snapshots) != last_snapshot_count:
progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
if last_snapshot_count is not None:
if len(active_snapshots) > last_snapshot_count:
progress_layout.log_event(
f"Active snapshots: {last_snapshot_count}{len(active_snapshots)}",
style="cyan"
)
# Log which snapshots started
for snapshot in active_snapshots[-1:]: # Just show the newest one
progress_layout.log_event(
f"Started: {snapshot.url[:60]}",
style="green"
)
# Log SnapshotWorker count
from archivebox.machine.models import Process
all_workers = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
status__in=['running', 'started']
).count()
progress_layout.log_event(
f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)",
style="grey53"
)
else:
progress_layout.log_event(
f"Active snapshots: {last_snapshot_count}{len(active_snapshots)}",
style="blue"
)
last_snapshot_count = len(active_snapshots)
# Track which snapshots are still active
@@ -450,13 +567,14 @@ class Orchestrator:
active_ids.add(snapshot.id)
total = snapshot.archiveresult_set.count()
if total == 0:
continue
completed = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).count()
# Count hooks by status for debugging
queued = snapshot.archiveresult_set.filter(status='queued').count()
started = snapshot.archiveresult_set.filter(status='started').count()
# Find currently running hook (ordered by hook_name to get lowest step number)
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
if not current_ar:
@@ -472,24 +590,78 @@ class Orchestrator:
# Clean up the name: remove prefix and extension
clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
current_plugin = f"{clean_name}"
current_plugin = clean_name
elif total == 0:
# Snapshot just started, hooks not created yet
current_plugin = "initializing"
elif queued > 0:
# Hooks created but none started yet
current_plugin = "waiting"
# Build description with URL + current plugin
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
description = f"{url}{current_plugin}"
# Update snapshot worker (show even if no hooks yet)
# Debug: Log first time we see this snapshot
if snapshot.id not in progress_layout.snapshot_to_worker:
progress_layout.log_event(
f"Assigning to worker: {snapshot.url[:50]}",
style="grey53"
)
# Create or update task
if snapshot.id not in task_ids:
task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
else:
# Update both progress and description
progress.update(task_ids[snapshot.id], description=description, completed=completed)
# Track progress changes
prev_progress = snapshot_progress.get(snapshot.id, (0, 0, ''))
curr_progress = (total, completed, current_plugin)
# Remove tasks for snapshots that are no longer active
for snapshot_id in list(task_ids.keys()):
if prev_progress != curr_progress:
prev_total, prev_completed, prev_plugin = prev_progress
# Log hooks created
if total > prev_total:
progress_layout.log_event(
f"Hooks created: {total} for {snapshot.url[:40]}",
style="cyan"
)
# Log hook completion
if completed > prev_completed:
progress_layout.log_event(
f"Hook completed: {completed}/{total} for {snapshot.url[:40]}",
style="green"
)
# Log plugin change
if current_plugin and current_plugin != prev_plugin:
progress_layout.log_event(
f"Running: {current_plugin} ({snapshot.url[:40]})",
style="yellow"
)
snapshot_progress[snapshot.id] = curr_progress
# Debug: Every 10 ticks, log detailed status if stuck at initializing
if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing":
progress_layout.log_event(
f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})",
style="red"
)
progress_layout.update_snapshot_worker(
snapshot_id=snapshot.id,
url=snapshot.url,
total=max(total, 1), # Show at least 1 to avoid division by zero
completed=completed,
current_plugin=current_plugin,
)
# Remove snapshots that are no longer active
for snapshot_id in list(progress_layout.snapshot_to_worker.keys()):
if snapshot_id not in active_ids:
progress.remove_task(task_ids[snapshot_id])
del task_ids[snapshot_id]
progress_layout.log_event(
f"Snapshot completed/removed",
style="blue"
)
progress_layout.remove_snapshot_worker(snapshot_id)
# Also clean up progress tracking
if snapshot_id in snapshot_progress:
del snapshot_progress[snapshot_id]
# Track idle state
has_pending = self.has_pending_work(queue_sizes)
@@ -503,6 +675,8 @@ class Orchestrator:
# Check if we should exit
if self.should_exit(queue_sizes):
if progress_layout:
progress_layout.log_event("All work complete", style="green")
log_worker_event(
worker_type='Orchestrator',
event='All work complete',
@@ -514,8 +688,12 @@ class Orchestrator:
time.sleep(self.POLL_INTERVAL)
except KeyboardInterrupt:
if progress_layout:
progress_layout.log_event("Interrupted by user", style="red")
print() # Newline after ^C
except BaseException as e:
if progress_layout:
progress_layout.log_event(f"Error: {e}", style="red")
self.on_shutdown(error=e)
raise
else:

View File

@@ -34,7 +34,7 @@ CPU_COUNT = cpu_count()
WORKER_TYPES: dict[str, type['Worker']] = {}
def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
def _run_worker(worker_class_name: str, worker_id: int, **kwargs):
"""
Module-level function to run a worker. Must be at module level for pickling.
"""
@@ -43,16 +43,28 @@ def _run_worker(worker_class_name: str, worker_id: int, daemon: bool, **kwargs):
# Get worker class by name to avoid pickling class objects
worker_cls = WORKER_TYPES[worker_class_name]
worker = worker_cls(worker_id=worker_id, daemon=daemon, **kwargs)
worker = worker_cls(worker_id=worker_id, **kwargs)
worker.runloop()
def _run_snapshot_worker(snapshot_id: str, worker_id: int, **kwargs):
"""
Module-level function to run a SnapshotWorker for a specific snapshot.
Must be at module level for pickling compatibility.
"""
from archivebox.config.django import setup_django
setup_django()
worker = SnapshotWorker(snapshot_id=snapshot_id, worker_id=worker_id, **kwargs)
worker.runloop()
class Worker:
"""
Base worker class that polls a queue and processes items directly.
Base worker class for CrawlWorker and SnapshotWorker.
Each item is processed by calling its state machine tick() method.
Workers exit when idle for too long (unless daemon mode).
Workers are spawned as subprocesses to process crawls and snapshots.
Each worker type has its own custom runloop implementation.
"""
name: ClassVar[str] = 'worker'
@@ -60,16 +72,10 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
def __init__(self, worker_id: int = 0, **kwargs: Any):
self.worker_id = worker_id
self.daemon = daemon
self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
self.pid_file: Path | None = None
self.idle_count: int = 0
def __repr__(self) -> str:
return f'[underline]{self.__class__.__name__}[/underline]\\[id={self.worker_id}, pid={self.pid}]'
@@ -78,55 +84,6 @@ class Worker:
"""Get the Django model class. Subclasses must override this."""
raise NotImplementedError("Subclasses must implement get_model()")
def get_queue(self) -> QuerySet:
"""Get the queue of objects ready for processing."""
Model = self.get_model()
return Model.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=Model.FINAL_STATES
).order_by('retry_at')
def claim_next(self):
"""
Atomically claim the next object from the queue.
Returns the claimed object or None if queue is empty or claim failed.
"""
Model = self.get_model()
queue = self.get_queue()
obj = queue.first()
if obj is None:
return None
# Atomic claim using optimistic locking on retry_at
claimed = Model.objects.filter(
pk=obj.pk,
retry_at=obj.retry_at,
).update(
retry_at=timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
)
if claimed == 1:
obj.refresh_from_db()
return obj
return None # Someone else claimed it
def process_item(self, obj) -> bool:
"""
Process a single item by calling its state machine tick().
Returns True on success, False on failure.
Subclasses can override for custom processing.
"""
try:
obj.sm.tick()
return True
except Exception as e:
# Error will be logged in runloop's completion event
traceback.print_exc()
return False
def on_startup(self) -> None:
"""Called when worker starts."""
from archivebox.machine.models import Process
@@ -139,7 +96,7 @@ class Worker:
if self.db_process.process_type != Process.TypeChoices.WORKER:
self.db_process.process_type = Process.TypeChoices.WORKER
update_fields.append('process_type')
# Store worker type name (crawl/snapshot/archiveresult) in worker_type field
# Store worker type name (crawl/snapshot) in worker_type field
if not self.db_process.worker_type:
self.db_process.worker_type = self.name
update_fields.append('worker_type')
@@ -148,13 +105,11 @@ class Worker:
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1 # Default for most workers
indent_level = 1 # Default for CrawlWorker
# Adjust indent level based on worker type
# SnapshotWorker gets indent level 2
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
log_worker_event(
worker_type=worker_type_name,
@@ -162,10 +117,6 @@ class Worker:
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
metadata={
'max_concurrent': self.MAX_CONCURRENT_TASKS,
'poll_interval': self.POLL_INTERVAL,
},
)
def on_shutdown(self, error: BaseException | None = None) -> None:
@@ -179,12 +130,10 @@ class Worker:
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1
indent_level = 1 # CrawlWorker
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
log_worker_event(
worker_type=worker_type_name,
@@ -195,121 +144,157 @@ class Worker:
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
)
def should_exit(self) -> bool:
"""Check if worker should exit due to idle timeout."""
if self.daemon:
return False
def _terminate_background_hooks(
self,
background_processes: dict[str, 'Process'],
worker_type: str,
indent_level: int,
) -> None:
"""
Terminate background hooks in 3 phases (shared logic for Crawl/Snapshot workers).
if self.IDLE_TIMEOUT == 0:
return False
Phase 1: Send SIGTERM to all bg hooks + children in parallel (polite request to wrap up)
Phase 2: Wait for each hook's remaining timeout before SIGKILL
Phase 3: SIGKILL any stragglers that exceeded their timeout
return self.idle_count >= self.IDLE_TIMEOUT
Args:
background_processes: Dict mapping hook name -> Process instance
worker_type: Worker type name for logging (e.g., 'CrawlWorker', 'SnapshotWorker')
indent_level: Logging indent level (1 for Crawl, 2 for Snapshot)
"""
import signal
import time
def runloop(self) -> None:
"""Main worker loop - polls queue, processes items."""
self.on_startup()
if not background_processes:
return
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1
now = time.time()
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
# Phase 1: Send SIGTERM to ALL background processes + children in parallel
log_worker_event(
worker_type=worker_type,
event=f'Sending SIGTERM to {len(background_processes)} background hooks (+ children)',
indent_level=indent_level,
pid=self.pid,
)
try:
while True:
# Try to claim and process an item
obj = self.claim_next()
if obj is not None:
self.idle_count = 0
# Build metadata for task start
start_metadata = {}
url = None
if hasattr(obj, 'url'):
# SnapshotWorker
url = str(obj.url) if obj.url else None
elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'):
# ArchiveResultWorker
url = str(obj.snapshot.url) if obj.snapshot.url else None
elif hasattr(obj, 'get_urls_list'):
# CrawlWorker
urls = obj.get_urls_list()
url = urls[0] if urls else None
plugin = None
if hasattr(obj, 'plugin'):
# ArchiveResultWorker, Crawl
plugin = obj.plugin
# Build deadline map first (before killing, to get accurate remaining time)
deadlines = {}
for hook_name, process in background_processes.items():
elapsed = now - process.started_at.timestamp()
remaining = max(0, process.timeout - elapsed)
deadline = now + remaining
deadlines[hook_name] = (process, deadline)
# Send SIGTERM to all process trees in parallel (non-blocking)
for hook_name, process in background_processes.items():
try:
# Get chrome children (renderer processes etc) before sending signal
children_pids = process.get_children_pids()
if children_pids:
# Chrome hook with children - kill tree
os.kill(process.pid, signal.SIGTERM)
for child_pid in children_pids:
try:
os.kill(child_pid, signal.SIGTERM)
except ProcessLookupError:
pass
log_worker_event(
worker_type=worker_type_name,
event='Processing',
worker_type=worker_type,
event=f'Sent SIGTERM to {hook_name} + {len(children_pids)} children',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
url=url,
plugin=plugin,
metadata=start_metadata if start_metadata else None,
)
start_time = time.time()
success = self.process_item(obj)
elapsed = time.time() - start_time
# Build metadata for task completion
complete_metadata = {
'duration': elapsed,
'status': 'success' if success else 'failed',
}
log_worker_event(
worker_type=worker_type_name,
event='Completed' if success else 'Failed',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
url=url,
plugin=plugin,
metadata=complete_metadata,
)
else:
# No work available - idle logging suppressed
self.idle_count += 1
# No children - normal kill
os.kill(process.pid, signal.SIGTERM)
except ProcessLookupError:
pass # Already dead
except Exception as e:
log_worker_event(
worker_type=worker_type,
event=f'Failed to SIGTERM {hook_name}: {e}',
indent_level=indent_level,
pid=self.pid,
)
# Check if we should exit
if self.should_exit():
# Exit logging suppressed - shutdown will be logged by on_shutdown()
break
# Phase 2: Wait for all processes in parallel, respecting individual timeouts
for hook_name, (process, deadline) in deadlines.items():
remaining = deadline - now
log_worker_event(
worker_type=worker_type,
event=f'Waiting up to {remaining:.1f}s for {hook_name}',
indent_level=indent_level,
pid=self.pid,
)
time.sleep(self.POLL_INTERVAL)
# Poll all processes in parallel using Process.poll()
still_running = set(deadlines.keys())
except KeyboardInterrupt:
pass
except BaseException as e:
self.on_shutdown(error=e)
raise
else:
self.on_shutdown()
while still_running:
time.sleep(0.1)
now = time.time()
for hook_name in list(still_running):
process, deadline = deadlines[hook_name]
# Check if process exited using Process.poll()
exit_code = process.poll()
if exit_code is not None:
# Process exited
still_running.remove(hook_name)
log_worker_event(
worker_type=worker_type,
event=f'{hook_name} exited with code {exit_code}',
indent_level=indent_level,
pid=self.pid,
)
continue
# Check if deadline exceeded
if now >= deadline:
# Timeout exceeded - SIGKILL process tree
try:
# Get children before killing (chrome may have spawned more)
children_pids = process.get_children_pids()
if children_pids:
# Kill children first
for child_pid in children_pids:
try:
os.kill(child_pid, signal.SIGKILL)
except ProcessLookupError:
pass
# Then kill parent
process.kill(signal_num=signal.SIGKILL)
log_worker_event(
worker_type=worker_type,
event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)',
indent_level=indent_level,
pid=self.pid,
)
except Exception as e:
log_worker_event(
worker_type=worker_type,
event=f'Failed to SIGKILL {hook_name}: {e}',
indent_level=indent_level,
pid=self.pid,
)
still_running.remove(hook_name)
@classmethod
def start(cls, worker_id: int | None = None, daemon: bool = False, **kwargs: Any) -> int:
def start(cls, **kwargs: Any) -> int:
"""
Fork a new worker as a subprocess.
Returns the PID of the new process.
"""
from archivebox.machine.models import Process
if worker_id is None:
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
# Use module-level function for pickling compatibility
proc = MPProcess(
target=_run_worker,
args=(cls.name, worker_id, daemon),
args=(cls.name, worker_id),
kwargs=kwargs,
name=f'{cls.name}_worker_{worker_id}',
)
@@ -356,120 +341,397 @@ class Worker:
class CrawlWorker(Worker):
"""Worker for processing Crawl objects."""
"""
Worker for processing Crawl objects.
Responsibilities:
1. Run on_Crawl__* hooks (e.g., chrome launcher)
2. Create Snapshots from URLs
3. Spawn SnapshotWorkers (up to MAX_SNAPSHOT_WORKERS)
4. Monitor snapshots and seal crawl when all done
"""
name: ClassVar[str] = 'crawl'
MAX_TICK_TIME: ClassVar[int] = 60
MAX_SNAPSHOT_WORKERS: ClassVar[int] = 8 # Per crawl limit
def __init__(self, crawl_id: str, **kwargs: Any):
super().__init__(**kwargs)
self.crawl_id = crawl_id
self.crawl = None
def get_model(self):
from archivebox.crawls.models import Crawl
return Crawl
def get_queue(self) -> QuerySet:
"""Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
qs = super().get_queue()
if self.crawl_id:
qs = qs.filter(id=self.crawl_id)
return qs
def on_startup(self) -> None:
"""Load crawl."""
super().on_startup()
from archivebox.crawls.models import Crawl
self.crawl = Crawl.objects.get(id=self.crawl_id)
def runloop(self) -> None:
    """Run crawl state machine, spawn SnapshotWorkers.

    Drives the crawl QUEUED -> STARTED via the state machine, then loops:
    spawning SnapshotWorkers for queued snapshots every 2s until all
    snapshots are finished, at which point the crawl is sealed.
    on_shutdown() always runs (terminates background hooks) even on error.
    """
    import sys
    self.on_startup()
    try:
        print(f'[cyan]🔄 CrawlWorker.runloop: Starting tick() for crawl {self.crawl_id}[/cyan]', file=sys.stderr)
        # Advance state machine: QUEUED → STARTED (triggers run() via @started.enter)
        self.crawl.sm.tick()
        # tick() may mutate the row from inside the state machine — reload it.
        self.crawl.refresh_from_db()
        print(f'[cyan]🔄 tick() complete, crawl status={self.crawl.status}[/cyan]', file=sys.stderr)
        # Now spawn SnapshotWorkers and monitor progress
        while True:
            # Check if crawl is done
            if self._is_crawl_finished():
                print(f'[cyan]🔄 Crawl finished, sealing...[/cyan]', file=sys.stderr)
                self.crawl.sm.seal()
                break
            # Spawn workers for queued snapshots
            self._spawn_snapshot_workers()
            time.sleep(2)  # Check every 2s
    finally:
        self.on_shutdown()
def _spawn_snapshot_workers(self) -> None:
    """Spawn SnapshotWorkers for queued snapshots (up to limit).

    Counts this CrawlWorker's live child snapshot workers, then forks new
    workers for QUEUED snapshots until MAX_SNAPSHOT_WORKERS is reached.
    The spawned SnapshotWorker marks its snapshot STARTED in its on_startup().
    """
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Process
    # Count running SnapshotWorkers for this crawl
    running_count = Process.objects.filter(
        process_type=Process.TypeChoices.WORKER,
        worker_type='snapshot',
        parent_id=self.db_process.id,  # Children of this CrawlWorker
        status__in=['running', 'started'],
    ).count()
    if running_count >= self.MAX_SNAPSHOT_WORKERS:
        return  # At limit
    # Get queued snapshots for this crawl (SnapshotWorker will mark as STARTED in on_startup)
    # NOTE(review): rows could be claimed elsewhere between this query and
    # SnapshotWorker.start() below — confirm double-claiming is handled downstream.
    queued_snapshots = Snapshot.objects.filter(
        crawl_id=self.crawl_id,
        status=Snapshot.StatusChoices.QUEUED,
    ).order_by('created_at')[:self.MAX_SNAPSHOT_WORKERS - running_count]
    import sys
    print(f'[yellow]🔧 _spawn_snapshot_workers: running={running_count}/{self.MAX_SNAPSHOT_WORKERS}, queued={queued_snapshots.count()}[/yellow]', file=sys.stderr)
    # Spawn workers
    for snapshot in queued_snapshots:
        print(f'[yellow]🔧 Spawning worker for {snapshot.url} (status={snapshot.status})[/yellow]', file=sys.stderr)
        SnapshotWorker.start(snapshot_id=str(snapshot.id))
        log_worker_event(
            worker_type='CrawlWorker',
            event=f'Spawned SnapshotWorker for {snapshot.url}',
            indent_level=1,
            pid=self.pid,
        )
def _is_crawl_finished(self) -> bool:
    """Return True when no snapshot in this crawl is still queued or running.

    Used by runloop() to decide when the crawl can be sealed.
    """
    from archivebox.core.models import Snapshot
    # .exists() lets the database stop at the first pending row instead of
    # counting all of them (the exact count was never used).
    return not Snapshot.objects.filter(
        crawl_id=self.crawl_id,
        status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
    ).exists()
def on_shutdown(self, error: BaseException | None = None) -> None:
    """
    Terminate all background Crawl hooks when crawl finishes.

    Background hooks (e.g., chrome launcher) should only be killed when:
    - All snapshots are done (crawl is sealed)
    - Worker is shutting down
    """
    from archivebox.machine.models import Process
    # Query for all running hook processes that are children of this CrawlWorker
    background_hooks = Process.objects.filter(
        parent_id=self.db_process.id,
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
    ).select_related('machine')
    # Build dict for shared termination logic, keyed by the hook's argv[0]
    # (falls back to a pid-based name when cmd is empty).
    # NOTE(review): two hooks sharing the same argv[0] collide on the dict key
    # and only one would be terminated — confirm keys are unique per crawl.
    background_processes = {
        hook.cmd[0] if hook.cmd else f'hook-{hook.pid}': hook
        for hook in background_hooks
    }
    # Use shared termination logic from Worker base class
    self._terminate_background_hooks(
        background_processes=background_processes,
        worker_type='CrawlWorker',
        indent_level=1,
    )
    super().on_shutdown(error)
class SnapshotWorker(Worker):
"""Worker for processing Snapshot objects."""
"""
Worker that owns sequential hook execution for ONE snapshot.
Unlike other workers, SnapshotWorker doesn't poll a queue - it's given
a specific snapshot_id and runs all hooks for that snapshot sequentially.
Execution flow:
1. Mark snapshot as STARTED
2. Discover hooks for snapshot
3. For each hook (sorted by name):
a. Fork hook Process
b. If foreground: wait for completion
c. If background: track but continue to next hook
d. Update ArchiveResult status
e. Advance current_step when all step's hooks complete
4. When all hooks done: seal snapshot
5. On shutdown: SIGTERM all background hooks
"""
name: ClassVar[str] = 'snapshot'
MAX_TICK_TIME: ClassVar[int] = 60
def __init__(self, snapshot_id: str, **kwargs: Any):
    """Bind this worker to a single Snapshot identified by ``snapshot_id``."""
    super().__init__(**kwargs)
    self.snapshot_id = snapshot_id
    self.snapshot = None  # Snapshot instance; loaded in on_startup()
    self.background_processes: dict[str, Any] = {}  # hook_name -> Process
def get_model(self):
    """Not used - SnapshotWorker doesn't poll queues."""
    # Still returns Snapshot so base-class helpers that expect a model keep working.
    from archivebox.core.models import Snapshot
    return Snapshot
def get_queue(self) -> QuerySet:
    """Return the base queue narrowed to this worker's single snapshot.

    SnapshotWorker is dedicated to one snapshot rather than polling a shared
    queue, so the queue is restricted to exactly that row.

    Fix: previously filtered on ``self.crawl_id``, an attribute this class
    never sets (``__init__`` stores only ``snapshot_id``), which raised
    AttributeError whenever the queue was inspected.
    """
    qs = super().get_queue()
    if self.snapshot_id:
        qs = qs.filter(id=self.snapshot_id)
    return qs
def on_startup(self) -> None:
    """Load the target snapshot and mark it as STARTED.

    Setting STARTED and clearing retry_at takes the snapshot out of the
    polling queue so no other worker claims it.
    (Also removes stray diff-artifact lines that previously split this
    method with an unrelated class header.)
    """
    super().on_startup()
    from archivebox.core.models import Snapshot
    self.snapshot = Snapshot.objects.get(id=self.snapshot_id)
    # Mark snapshot as STARTED; retry_at=None means no more polling needed.
    self.snapshot.status = Snapshot.StatusChoices.STARTED
    self.snapshot.retry_at = None
    self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
name: ClassVar[str] = 'archiveresult'
MAX_TICK_TIME: ClassVar[int] = 120
def __init__(self, plugin: str | None = None, **kwargs: Any):
super().__init__(**kwargs)
self.plugin = plugin
def get_model(self):
def runloop(self) -> None:
"""Execute all hooks sequentially."""
from archivebox.hooks import discover_hooks, is_background_hook, extract_step
from archivebox.core.models import ArchiveResult
return ArchiveResult
def get_queue(self) -> QuerySet:
"""
Get queue of ArchiveResults ready for processing.
self.on_startup()
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
"""
from archivebox.core.models import ArchiveResult
from archivebox.hooks import extract_step
qs = super().get_queue()
if self.crawl_id:
qs = qs.filter(snapshot__crawl_id=self.crawl_id)
if self.plugin:
qs = qs.filter(plugin=self.plugin)
# Step-based filtering: only process ARs whose step <= snapshot.current_step
# Since step is derived from hook_name, we filter in Python after initial query
# This is efficient because the base query already filters by retry_at and status
# Get candidate ARs
candidates = list(qs[:50]) # Limit to avoid loading too many
ready_pks = []
for ar in candidates:
if not ar.hook_name:
# Legacy ARs without hook_name - process them
ready_pks.append(ar.pk)
continue
ar_step = extract_step(ar.hook_name)
snapshot_step = ar.snapshot.current_step
if ar_step <= snapshot_step:
ready_pks.append(ar.pk)
# Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
def process_item(self, obj) -> bool:
"""Process an ArchiveResult by running its plugin."""
try:
obj.sm.tick()
return True
# Discover all hooks for this snapshot
hooks = discover_hooks('Snapshot', config=self.snapshot.config)
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
# Execute each hook sequentially
for hook_path in hooks:
hook_name = hook_path.name
plugin = self._extract_plugin_name(hook_name)
hook_step = extract_step(hook_name)
is_background = is_background_hook(hook_name)
# Create ArchiveResult for THIS HOOK (not per plugin)
# One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
# Unique key = (snapshot, plugin, hook_name) for idempotency
ar, created = ArchiveResult.objects.get_or_create(
snapshot=self.snapshot,
plugin=plugin,
hook_name=hook_name,
defaults={
'status': ArchiveResult.StatusChoices.STARTED,
'start_ts': timezone.now(),
}
)
if not created:
# Update existing AR to STARTED
ar.status = ArchiveResult.StatusChoices.STARTED
ar.start_ts = timezone.now()
ar.save(update_fields=['status', 'start_ts', 'modified_at'])
# Fork and run the hook
process = self._run_hook(hook_path, ar)
if is_background:
# Track but don't wait
self.background_processes[hook_name] = process
log_worker_event(
worker_type='SnapshotWorker',
event=f'Started background hook: {hook_name} (timeout={process.timeout}s)',
indent_level=2,
pid=self.pid,
)
else:
# Wait for foreground hook to complete
self._wait_for_hook(process, ar)
log_worker_event(
worker_type='SnapshotWorker',
event=f'Completed hook: {hook_name}',
indent_level=2,
pid=self.pid,
)
# Check if we can advance to next step
self._try_advance_step()
# All hooks launched (or completed) - cleanup and seal
self._cleanup_empty_archiveresults()
self.snapshot.status = Snapshot.StatusChoices.SEALED
self.snapshot.save(update_fields=['status', 'modified_at'])
except Exception as e:
# Error will be logged in runloop's completion event
traceback.print_exc()
return False
# Mark snapshot as failed
self.snapshot.status = Snapshot.StatusChoices.SEALED # Still seal on error
self.snapshot.save(update_fields=['status', 'modified_at'])
raise
finally:
self.on_shutdown()
def _run_hook(self, hook_path: Path, ar: Any) -> Any:
    """Fork and run a hook using Process model, return Process.

    The hook runs in the ArchiveResult's output directory and is recorded
    as a child of this worker's Process row, keeping the process tree
    queryable.
    """
    from archivebox.hooks import run_hook
    # Create output directory
    output_dir = ar.create_output_dir()
    # Run hook using Process.launch() - returns Process model directly
    # Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy
    # NOTE(review): timeout is hard-coded to 120s — confirm plugin-specific
    # timeouts (e.g. a ytdlp-style long timeout) shouldn't apply here instead.
    process = run_hook(
        script=hook_path,
        output_dir=output_dir,
        config=self.snapshot.config,
        timeout=120,
        parent=self.db_process,
        url=str(self.snapshot.url),
        snapshot_id=str(self.snapshot.id),
    )
    # Link ArchiveResult to Process for tracking
    ar.process = process
    ar.save(update_fields=['process_id', 'modified_at'])
    return process
def _wait_for_hook(self, process: Any, ar: Any) -> None:
    """Wait for hook using Process.wait(), update AR status.

    Blocks until the hook exits (or its timeout elapses, in which case it
    is SIGKILLed and treated as failed), then records the outcome on the
    ArchiveResult.
    """
    # Use Process.wait() helper instead of manual polling
    try:
        exit_code = process.wait(timeout=process.timeout)
    except TimeoutError:
        # Hook exceeded timeout - kill it (SIGKILL: no further grace period)
        process.kill(signal_num=9)
        exit_code = -1
    # Update ArchiveResult from hook output
    ar.update_from_output()
    ar.end_ts = timezone.now()
    # Determine final status from hook exit code
    if exit_code == 0:
        ar.status = ar.StatusChoices.SUCCEEDED
    else:
        ar.status = ar.StatusChoices.FAILED
    # NOTE(review): update_from_output() may modify fields beyond these
    # update_fields — confirm it persists them itself, otherwise they're lost.
    ar.save(update_fields=['status', 'end_ts', 'modified_at'])
def _try_advance_step(self) -> None:
    """Advance current_step if all foreground hooks in current step are done."""
    from django.db.models import Q
    from archivebox.core.models import ArchiveResult
    current_step = self.snapshot.current_step
    # Single query: foreground hooks in current step that aren't finished
    # Foreground hooks: hook_name doesn't contain '.bg.'
    # NOTE(review): '__{current_step}_' only matches when current_step is
    # spelled exactly as in hook filenames; filenames seen elsewhere use
    # two-digit prefixes ('__20_', '__50_'), so a single-digit step value
    # would match nothing and the step would advance unconditionally — verify.
    pending_foreground = self.snapshot.archiveresult_set.filter(
        Q(hook_name__contains=f'__{current_step}_') &  # Current step
        ~Q(hook_name__contains='.bg.') &  # Not background
        ~Q(status__in=ArchiveResult.FINAL_STATES)  # Not finished
    ).exists()
    if pending_foreground:
        return  # Still waiting for hooks
    # All foreground hooks done - advance!
    self.snapshot.current_step += 1
    self.snapshot.save(update_fields=['current_step', 'modified_at'])
    log_worker_event(
        worker_type='SnapshotWorker',
        event=f'Advanced to step {self.snapshot.current_step}',
        indent_level=2,
        pid=self.pid,
    )
def _cleanup_empty_archiveresults(self) -> None:
    """Delete finished ArchiveResults that produced no output files.

    Only rows already in a final state are removed, so in-flight hooks
    that haven't written output yet are never touched.
    """
    empty_ars = self.snapshot.archiveresult_set.filter(
        output_files={}  # No output files
    ).filter(
        status__in=self.snapshot.archiveresult_set.model.FINAL_STATES  # Only delete finished ones
    )
    # QuerySet.delete() returns (total_deleted, per_model_counts); using it
    # directly avoids the previous extra count() query and the race where
    # rows change between count() and delete().
    deleted_count, _ = empty_ars.delete()
    if deleted_count > 0:
        log_worker_event(
            worker_type='SnapshotWorker',
            event=f'Deleted {deleted_count} empty ArchiveResults',
            indent_level=2,
            pid=self.pid,
        )
def on_shutdown(self, error: BaseException | None = None) -> None:
    """
    Terminate all background Snapshot hooks when snapshot finishes.

    Background hooks should only be killed when:
    - All foreground hooks are done (snapshot is sealed)
    - Worker is shutting down
    """
    # Use shared termination logic from Worker base class.
    # self.background_processes was populated in the run loop as background
    # hooks were launched (hook_name -> Process).
    self._terminate_background_hooks(
        background_processes=self.background_processes,
        worker_type='SnapshotWorker',
        indent_level=2,
    )
    super().on_shutdown(error)
@staticmethod
def _extract_plugin_name(hook_name: str) -> str:
"""Extract plugin name from hook filename."""
# on_Snapshot__50_wget.py -> wget
name = hook_name.split('__')[-1] # Get part after last __
name = name.replace('.py', '').replace('.js', '').replace('.sh', '')
name = name.replace('.bg', '') # Remove .bg suffix
return name
@classmethod
def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
"""Fork a new worker as subprocess with optional plugin filter."""
def start(cls, snapshot_id: str, **kwargs: Any) -> int:
"""Fork a SnapshotWorker for a specific snapshot."""
from archivebox.machine.models import Process
if worker_id is None:
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
# Use module-level function for pickling compatibility
proc = MPProcess(
target=_run_worker,
args=(cls.name, worker_id, daemon),
kwargs={'plugin': plugin, **kwargs},
name=f'{cls.name}_worker_{worker_id}',
target=_run_snapshot_worker, # New module-level function
args=(snapshot_id, worker_id),
kwargs=kwargs,
name=f'snapshot_worker_{snapshot_id[:8]}',
)
proc.start()
@@ -481,7 +743,6 @@ class ArchiveResultWorker(Worker):
WORKER_TYPES.update({
'crawl': CrawlWorker,
'snapshot': SnapshotWorker,
'archiveresult': ArchiveResultWorker,
})

View File

@@ -0,0 +1,333 @@
# Process Model Integration Plan
## Current Architecture
### Hook Execution Flow
```
Orchestrator
├─> CrawlWorker
│ └─> Crawl.run() [state machine @started.enter]
│ └─> run_hook() for on_Crawl__* hooks
│ └─> subprocess.Popen (NOT using Process model)
└─> SnapshotWorker
└─> Snapshot.run() [planned - doesn't exist yet]
└─> ArchiveResult.run() [state machine @started.enter]
└─> run_hook() for on_Snapshot__* hooks
└─> subprocess.Popen (NOT using Process model)
```
### Problem
1. **No Process tracking**: `run_hook()` uses `subprocess.Popen` directly, never creates Process records
2. **Orphaned Process model**: Process model has `.launch()`, `.wait()`, `.terminate()` methods that are NEVER used
3. **Manual process management**: SnapshotWorker manually uses psutil for waiting/killing
4. **Duplicate logic**: Process model and run_hook() both do subprocess management independently
## Unified Architecture
### Goal
Make Process model the **single source of truth** for all subprocess operations:
- Hook execution
- PID tracking
- stdout/stderr capture
- Process lifecycle (launch, wait, terminate)
### Design
```python
# hooks.py - Thin wrapper
def run_hook(...) -> Process:
"""
Run a hook using Process model (THIN WRAPPER).
Returns Process model instance for tracking and control.
"""
from archivebox.machine.models import Process
# Build command
cmd = build_hook_cmd(script, kwargs)
# Use Process.launch() - handles everything
process = Process.objects.create(
machine=Machine.current(),
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
env=build_hook_env(config),
timeout=timeout,
)
# Launch subprocess
process.launch(background=is_background_hook(script.name))
return process # Return Process, not dict
# worker.py - Use Process methods
class SnapshotWorker:
def _run_hook(self, hook_path, ar) -> Process:
"""Fork hook using Process model."""
process = run_hook(
hook_path,
ar.create_output_dir(),
self.snapshot.config,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
# Link ArchiveResult to Process
ar.process = process
ar.save()
return process
def _wait_for_hook(self, process, ar):
"""Wait using Process.wait() method."""
exit_code = process.wait(timeout=None)
# Update AR from hook output
ar.update_from_output()
ar.status = ar.StatusChoices.SUCCEEDED if exit_code == 0 else ar.StatusChoices.FAILED
ar.save()
def on_shutdown(self):
"""
Terminate all background hooks in parallel with per-plugin timeouts.
Phase 1: Send SIGTERM to all in parallel (polite request to wrap up)
Phase 2: Wait for all in parallel, respecting individual plugin timeouts
Phase 3: SIGKILL any that exceed their timeout
Each plugin has its own timeout (SCREENSHOT_TIMEOUT=60, YTDLP_TIMEOUT=300, etc.)
Some hooks (consolelog, responses) exit immediately on SIGTERM.
Others (ytdlp, wget) need their full timeout to finish actual work.
"""
# Send SIGTERM to all processes in parallel
for hook_name, process in self.background_processes.items():
os.kill(process.pid, signal.SIGTERM)
# Build per-process deadlines based on plugin-specific timeouts
deadlines = {
name: (proc, time.time() + max(0, proc.timeout - (time.time() - proc.started_at.timestamp())))
for name, proc in self.background_processes.items()
}
# Poll all processes in parallel - no head-of-line blocking
still_running = set(deadlines.keys())
while still_running:
time.sleep(0.1)
for name in list(still_running):
proc, deadline = deadlines[name]
if not proc.is_running():
still_running.remove(name)
elif time.time() >= deadline:
os.kill(proc.pid, signal.SIGKILL) # Timeout exceeded
still_running.remove(name)
# models.py - Process becomes active
class Process:
def launch(self, background=False):
"""Spawn subprocess and track it."""
with open(self.stdout_file, 'w') as out, open(self.stderr_file, 'w') as err:
proc = subprocess.Popen(
self.cmd,
cwd=self.pwd,
stdout=out,
stderr=err,
env=self._build_env(),
)
self.pid = proc.pid
self.started_at = timezone.now()
self.status = self.StatusChoices.RUNNING
self.save()
if not background:
# Foreground - wait inline
proc.wait()
self.exit_code = proc.returncode
self.ended_at = timezone.now()
self.status = self.StatusChoices.EXITED
self.save()
return self
def wait(self, timeout=None):
"""Wait for process to exit, polling DB."""
while True:
self.refresh_from_db()
if self.status == self.StatusChoices.EXITED:
return self.exit_code
# Check via psutil if Process died without updating DB
if not self.is_running():
self._reap() # Update status from OS
return self.exit_code
time.sleep(0.1)
def terminate(self, sig=signal.SIGTERM):
"""Gracefully terminate: SIGTERM → wait → SIGKILL."""
if not self.is_running():
return True
os.kill(self.pid, sig)
# Wait for graceful shutdown
for _ in range(50): # 5 seconds
if not self.is_running():
self._reap()
return True
time.sleep(0.1)
# Escalate to SIGKILL
os.kill(self.pid, signal.SIGKILL)
self._reap()
return True
```
## Migration Steps
### Step 1: Update Process.launch() (DONE - already exists)
Process model already has `.launch()`, `.wait()`, `.terminate()` methods implemented in machine/models.py:1295-1593
### Step 2: Refactor run_hook() to use Process.launch()
**File**: `archivebox/hooks.py`
Change signature from:
```python
def run_hook(...) -> HookResult: # Returns dict
```
To:
```python
def run_hook(...) -> Process: # Returns Process model
```
**Implementation**:
```python
def run_hook(script, output_dir, config, timeout=None, **kwargs) -> Process:
from archivebox.machine.models import Process, Machine
# Build command
cmd = build_hook_cmd(script, kwargs)
env = build_hook_env(config)
is_bg = is_background_hook(script.name)
# Create Process record
process = Process.objects.create(
machine=Machine.current(),
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
env=env,
timeout=timeout or 120,
)
# Launch subprocess
process.launch(background=is_bg)
return process
```
### Step 3: Update SnapshotWorker to use Process methods
**File**: `archivebox/workers/worker.py`
Replace manual psutil code with Process model methods (shown above in Design section).
### Step 4: Update ArchiveResult.run() to use new run_hook()
**File**: `archivebox/core/models.py:2559`
Change from:
```python
result = run_hook(...) # Returns HookResult dict
if result is None:
is_bg_hook = True
```
To:
```python
process = run_hook(...) # Returns Process
self.process = process
self.save()
if process.status == Process.StatusChoices.RUNNING:
# Background hook - still running
return
else:
# Foreground hook - completed
self.update_from_output()
```
### Step 5: Update Crawl.run() similarly
**File**: `archivebox/crawls/models.py:374`
Same pattern as ArchiveResult.run()
## Benefits
### 1. Single Source of Truth
- Process model owns ALL subprocess operations
- No duplicate logic between run_hook(), Process, and workers
- Consistent PID tracking, stdout/stderr handling
### 2. Proper Hierarchy
```
Process.parent_id creates tree:
Orchestrator (PID 1000)
└─> CrawlWorker (PID 1001, parent=1000)
└─> on_Crawl__01_chrome.js (PID 1010, parent=1001)
└─> SnapshotWorker (PID 1020, parent=1000)
└─> on_Snapshot__50_wget.py (PID 1021, parent=1020)
└─> on_Snapshot__63_ytdlp.bg.py (PID 1022, parent=1020)
```
### 3. Better Observability
- Query all hook processes: `snapshot.process_set.all()`
- Count running: `Process.objects.filter(status='running').count()`
- Track resource usage via Process.get_memory_info()
### 4. Cleaner Code
- SnapshotWorker._wait_for_hook: 25 lines → 8 lines
- SnapshotWorker.on_shutdown: 12 lines → 7 lines
- run_hook(): ~200 lines → ~50 lines
- Total: ~170 LoC saved
## Risks & Mitigation
### Risk 1: Breaking existing run_hook() callers
**Mitigation**: Two-phase rollout
1. Phase 1: Add run_hook_v2() that returns Process
2. Phase 2: Migrate callers to run_hook_v2()
3. Phase 3: Rename run_hook → run_hook_legacy, run_hook_v2 → run_hook
### Risk 2: Background hook tracking changes
**Mitigation**:
- Process.launch(background=True) handles async launches
- Process.wait() already polls for completion
- Behavior identical to current subprocess.Popen
### Risk 3: Performance overhead (extra DB writes)
**Mitigation**:
- Process records already being created (just not used)
- Batch updates where possible
- Monitor via metrics
## Timeline
### Immediate (This PR)
- [x] State machine fixes (completed)
- [x] Step advancement optimization (completed)
- [x] Document unified architecture (this file)
### Next PR (Process Integration)
1. Add run_hook_v2() returning Process
2. Update SnapshotWorker to use Process methods
3. Migrate ArchiveResult.run() and Crawl.run()
4. Deprecate old run_hook()
### Future
- Remove run_hook_legacy after migration complete
- Add Process.get_tree() for hierarchy visualization
- Add ProcessMachine state machine for lifecycle management