Run recursive crawl tests to completion

2026-04-06 07:47:53 +10:00 · 2026-03-15 06:55:35 -07:00
parent 68b9f75dab
commit 5fb3709281
1 changed file with 18 additions and 64 deletions
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -5,7 +5,6 @@ import json
 import os
 import subprocess
 import sqlite3
-import time
 from pathlib import Path

 import pytest
@@ -13,25 +12,6 @@ import pytest
 from .fixtures import process, disable_extractors_dict, recursive_test_site


-def wait_for_db_condition(timeout, condition, interval=0.5):
-    deadline = time.time() + timeout
-    while time.time() < deadline:
-        if os.path.exists("index.sqlite3"):
-            conn = sqlite3.connect("index.sqlite3")
-            try:
-                if condition(conn.cursor()):
-                    return True
-            finally:
-                conn.close()
-        time.sleep(interval)
-    return False
-
-
-def stop_process(proc):
-    proc.kill()
-    return proc.communicate()
-
-
 def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recursive_test_site):
    """Test that background hooks (.bg.) don't block other extractors from running."""
    os.chdir(tmp_path)
@@ -60,21 +40,15 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
        "SAVE_WGET": "true",
    })

-    proc = subprocess.Popen(
+    result = subprocess.run(
        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
        text=True,
        env=env,
+        timeout=60,
    )
-
-    assert wait_for_db_condition(
-        timeout=30,
-        condition=lambda c: c.execute(
-            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
-        ).fetchone()[0] > 0,
-    ), "Parser extractors never progressed beyond queued status"
-    stdout, stderr = stop_process(proc)
+    assert result.returncode == 0, result.stderr
+    stdout, stderr = result.stdout, result.stderr

    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
@@ -137,21 +111,14 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
        "USE_CHROME": "false",
    })

-    proc = subprocess.Popen(
+    result = subprocess.run(
        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
        text=True,
        env=env,
+        timeout=60,
    )
-
-    assert wait_for_db_condition(
-        timeout=20,
-        condition=lambda c: c.execute(
-            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' AND status IN ('started', 'succeeded', 'failed')"
-        ).fetchone()[0] > 0,
-    ), "parse_html_urls did not run in time"
-    stop_process(proc)
+    assert result.returncode == 0, result.stderr

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
@@ -204,21 +171,15 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
        "SAVE_TITLE": "false",
    })

-    proc = subprocess.Popen(
+    result = subprocess.run(
        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
        text=True,
        env=env,
+        timeout=60,
    )
-
-    assert wait_for_db_condition(
-        timeout=30,
-        condition=lambda c: c.execute(
-            "SELECT COUNT(*) FROM core_snapshot WHERE depth = 1"
-        ).fetchone()[0] >= 3,
-    ), "Recursive crawl never created child snapshots"
-    stdout, stderr = stop_process(proc)
+    assert result.returncode == 0, result.stderr
+    stdout, stderr = result.stdout, result.stderr

    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
@@ -368,21 +329,14 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
        "SAVE_FAVICON": "true",
    })

-    proc = subprocess.Popen(
+    result = subprocess.run(
        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+        capture_output=True,
        text=True,
        env=env,
+        timeout=60,
    )
-
-    assert wait_for_db_condition(
-        timeout=20,
-        condition=lambda c: c.execute(
-            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
-        ).fetchone()[0] > 0,
-    ), "Parser extractor never started"
-    stop_process(proc)
+    assert result.returncode == 0, result.stderr

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()