From 0ac83c8799da16dd0ab5bf0fe1ba882a0e47d0e9 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sun, 15 Mar 2026 14:15:04 -0700
Subject: [PATCH] Wait for crawl hook records before advancing

---
 archivebox/crawls/models.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 96c7db4b..b07831e9 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -432,7 +432,16 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     return set()
 
             from archivebox.hooks import extract_records_from_process
-            records = extract_records_from_process(process)
+            records = []
+            # Finite background hooks can exit before their stdout log is fully
+            # visible to our polling loop. Give successful hooks up to ~0.9s in
+            # total to flush JSONL records before we move on to downstream hooks.
+            for delay in (0.0, 0.05, 0.1, 0.25, 0.5):
+                if delay:
+                    time.sleep(delay)
+                records = extract_records_from_process(process)
+                if records:
+                    break
             if records:
                 print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
                 for record in records[:3]: