From 86fdc3be1e2e5074233cb31a736ed796f8496f38 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sun, 15 Mar 2026 11:07:55 -0700
Subject: [PATCH] Refresh worker config from resolved plugin installs

---
 archivebox/config/configset.py                |  5 +-
 archivebox/crawls/models.py                   | 16 ++++--
 archivebox/machine/models.py                  | 57 +++++++++++++++++--
 .../tests/test_worker_config_propagation.py   |  9 +++
 archivebox/workers/worker.py                  | 12 ++++
 5 files changed, 89 insertions(+), 10 deletions(-)

diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py
index 19e2e2d2..e284d44b 100644
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -255,9 +255,10 @@ def get_config(
     if crawl and hasattr(crawl, "config") and crawl.config:
         config.update(crawl.config)
 
-    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
+    # Add crawl path aliases for hooks that need shared crawl state.
     if crawl and hasattr(crawl, "output_dir"):
         config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
+        config['CRAWL_DIR'] = str(crawl.output_dir)
         config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID')
 
     # Apply snapshot config overrides (highest priority)
@@ -267,6 +268,8 @@ def get_config(
     if snapshot:
         config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID')
         config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0)
+        if hasattr(snapshot, "output_dir"):
+            config['SNAP_DIR'] = str(snapshot.output_dir)
         if getattr(snapshot, "crawl_id", None):
             config['CRAWL_ID'] = str(snapshot.crawl_id)
 
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 53f909f1..96c7db4b 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -342,8 +342,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n')
             f.flush()
 
-        # Get merged config with crawl context
-        config = get_config(crawl=self)
+        def get_runtime_config():
+            return get_config(crawl=self)
 
         machine = Machine.current()
         declared_binary_names: set[str] = set()
@@ -393,6 +393,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
         def run_crawl_hook(hook: Path) -> set[str]:
             executed_crawl_hooks.add(str(hook))
+            primary_url = next(
+                (line.strip() for line in self.urls.splitlines() if line.strip()),
+                self.urls.strip(),
+            )
 
             with open(debug_log, 'a') as f:
                 f.write(f'Running hook: {hook.name}\n')
@@ -405,9 +409,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             process = run_hook(
                 hook,
                 output_dir=output_dir,
-                config=config,
+                config=get_runtime_config(),
                 crawl_id=str(self.id),
                 source_url=self.urls,
+                url=primary_url,
+                snapshot_id=str(self.id),
             )
             with open(debug_log, 'a') as f:
                 f.write(f'Hook {hook.name} completed with status={process.status}\n')
@@ -476,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
 
                 provider_hooks = [
                     hook
-                    for hook in discover_hooks('Crawl', filter_disabled=False, config=config)
+                    for hook in discover_hooks('Crawl', filter_disabled=False, config=get_runtime_config())
                     if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks
                 ]
                 if not provider_hooks:
@@ -489,7 +495,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         with open(debug_log, 'a') as f:
             f.write(f'Discovering Crawl hooks...\n')
             f.flush()
-        hooks = discover_hooks('Crawl', config=config)
+        hooks = discover_hooks('Crawl', config=get_runtime_config())
         with open(debug_log, 'a') as f:
             f.write(f'Found {len(hooks)} hooks\n')
             f.flush()
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index 4740d639..e8f0ee2f 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -74,13 +74,31 @@ class Machine(ModelWithHealthStats):
         global _CURRENT_MACHINE
         if _CURRENT_MACHINE:
             if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
-                return _CURRENT_MACHINE
+                return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
             _CURRENT_MACHINE = None
         _CURRENT_MACHINE, _ = cls.objects.update_or_create(
             guid=get_host_guid(),
             defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()},
         )
-        return _CURRENT_MACHINE
+        return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
+
+    @classmethod
+    def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine':
+        if machine.config:
+            return machine
+
+        sibling = (
+            cls.objects
+            .exclude(pk=machine.pk)
+            .filter(hostname=machine.hostname)
+            .exclude(config={})
+            .order_by('-modified_at')
+            .first()
+        )
+        if sibling and sibling.config:
+            machine.config = dict(sibling.config)
+            machine.save(update_fields=['config', 'modified_at'])
+        return machine
 
     def to_json(self) -> dict:
         """
@@ -326,6 +344,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
 
         machine = Machine.current()
         overrides = overrides or {}
+        normalized_overrides = Binary._normalize_overrides(record.get('overrides', {}))
 
         # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
         # This happens when on_Crawl hooks detect already-installed binaries
@@ -357,7 +376,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
                 name=name,
                 defaults={
                     'binproviders': record.get('binproviders', 'env'),
-                    'overrides': record.get('overrides', {}),
+                    'overrides': normalized_overrides,
                     'status': Binary.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
                 }
@@ -394,6 +413,31 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
         self.modified_at = timezone.now()
         self.save()
 
+    @staticmethod
+    def _normalize_overrides(overrides: dict | None) -> dict:
+        """Normalize hook-emitted binary overrides to the canonical install_args shape."""
+        if not isinstance(overrides, dict):
+            return {}
+
+        normalized: dict = {}
+        reserved_keys = {'custom_cmd', 'cmd', 'command'}
+
+        for provider, value in overrides.items():
+            if provider in reserved_keys:
+                normalized[provider] = value
+                continue
+
+            if isinstance(value, dict):
+                normalized[provider] = value
+            elif isinstance(value, (list, tuple)):
+                normalized[provider] = {'install_args': list(value)}
+            elif isinstance(value, str) and value.strip():
+                normalized[provider] = {'install_args': value.strip()}
+            else:
+                normalized[provider] = value
+
+        return normalized
+
     def _allowed_binproviders(self) -> set[str] | None:
         """Return the allowed binproviders for this binary, or None for wildcard."""
         providers = str(self.binproviders or '').strip()
@@ -441,8 +485,13 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
         from archivebox.hooks import discover_hooks, run_hook
         from archivebox.config.configset import get_config
 
-        # Get merged config (Binary doesn't have crawl/snapshot context)
+        # Get merged config (Binary doesn't have crawl/snapshot context).
+        # Binary workers can install several dependencies in one process, so
+        # refresh from the latest persisted machine config before each hook run.
         config = get_config()
+        current_machine = Machine.current()
+        if current_machine.config:
+            config.update(current_machine.config)
 
         # Create output directory
         output_dir = self.output_dir
diff --git a/archivebox/tests/test_worker_config_propagation.py b/archivebox/tests/test_worker_config_propagation.py
index 61229118..dbb1bfe3 100644
--- a/archivebox/tests/test_worker_config_propagation.py
+++ b/archivebox/tests/test_worker_config_propagation.py
@@ -697,6 +697,15 @@ path = hook_process.env.get('PATH')
 if lib_bin_dir:
     assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."
 
+# Verify canonical crawl/snapshot directories are exported for plugins
+crawl_dir = hook_process.env.get('CRAWL_DIR')
+snap_dir = hook_process.env.get('SNAP_DIR')
+print(f"  CRAWL_DIR: {{crawl_dir}}")
+print(f"  SNAP_DIR: {{snap_dir}}")
+assert crawl_dir is not None, "CRAWL_DIR not set"
+assert snap_dir is not None, "SNAP_DIR not set"
+assert str(snapshot.id) in snap_dir, f"SNAP_DIR should contain snapshot id, got {{snap_dir}}"
+
 # Verify NODE_PATH is set
 node_path = hook_process.env.get('NODE_PATH')
 node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py
index 841f5017..bf872966 100644
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -299,6 +299,10 @@ class Worker:
         from django.conf import settings
         import sys
 
+        refresh_machine_config = bool(
+            parent and getattr(parent, 'process_type', None) == Process.TypeChoices.WORKER
+        )
+
         # Build command and get config for the appropriate scope
         if cls.name == 'crawl':
             crawl_id = kwargs.get('crawl_id')
@@ -349,6 +353,14 @@ class Worker:
         else:
             raise ValueError(f"Unknown worker type: {cls.name}")
 
+        if refresh_machine_config:
+            current_machine = Machine.current()
+            if current_machine.config:
+                # Worker subprocesses inherit parent Process.env, which can contain
+                # stale pre-install binary aliases. Refresh resolved machine values
+                # before serializing the child worker env.
+                env.update(current_machine.config)
+
         # Ensure output directory exists
         pwd.mkdir(parents=True, exist_ok=True)