From 0ac83c8799da16dd0ab5bf0fe1ba882a0e47d0e9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 15 Mar 2026 14:15:04 -0700 Subject: [PATCH] Wait for crawl hook records before advancing --- archivebox/crawls/models.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 96c7db4b..b07831e9 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -432,7 +432,16 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return set() from archivebox.hooks import extract_records_from_process - records = extract_records_from_process(process) + records = [] + # Finite background hooks can exit before their stdout log is fully + # visible to our polling loop. Give successful hooks a brief chance + # to flush JSONL records before we move on to downstream hooks. + for delay in (0.0, 0.05, 0.1, 0.25, 0.5): + if delay: + time.sleep(delay) + records = extract_records_from_process(process) + if records: + break if records: print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]') for record in records[:3]: