From 3672174dad81f078f54c931b85cf86830073b5ec Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 2 Jan 2026 00:24:44 -0800 Subject: [PATCH] fix transition mid transition --- archivebox/core/models.py | 6 ++++-- archivebox/crawls/models.py | 13 ++++++++----- archivebox/plugins/chrome/chrome_utils.js | 1 + 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 471a410d..3a21041a 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -2241,10 +2241,12 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): # Tick Event (polled by workers) tick = ( queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') + queued.to(started, cond='can_start') | + started.to.itself(unless='is_finished') | + started.to(sealed, cond='is_finished') ) - # Manual event (triggered by last ArchiveResult finishing) + # Manual event (can also be triggered by last ArchiveResult finishing) seal = started.to(sealed) def can_start(self) -> bool: diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 40bbb6c2..52ed6c81 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -548,17 +548,20 @@ class CrawlMachine(BaseStateMachine, strict_states=True): if root_snapshot: print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr) # Update status to STARTED - # Set retry_at to far future so workers don't claim us (we're waiting for snapshots to finish) + # Set retry_at to None so workers don't claim us (we wait for snapshots to finish) # Last snapshot will manually call self.seal() when done self.crawl.update_and_requeue( - retry_at=timezone.now() + timedelta(days=365), + retry_at=None, status=Crawl.StatusChoices.STARTED, ) else: # No snapshots (system crawl like archivebox://install) - print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) - # Seal immediately since there's no work to do - self.seal() + print(f'[cyan]🔄 No snapshots created, allowing immediate seal[/cyan]', file=sys.stderr) + # Set retry_at=now so next tick() will transition to sealed + self.crawl.update_and_requeue( + retry_at=timezone.now(), + status=Crawl.StatusChoices.STARTED, + ) except Exception as e: print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]') diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index b0293356..dd9ad47b 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -1435,6 +1435,7 @@ function getTestEnv() { MACHINE_TYPE: machineType, LIB_DIR: libDir, NODE_MODULES_DIR: nodeModulesDir, + NODE_PATH: nodeModulesDir, // Node.js uses NODE_PATH for module resolution NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'), CHROME_EXTENSIONS_DIR: getExtensionsDir(), };