Stabilize recursive crawl CI coverage

2026-04-06 07:47:53 +10:00 · 2026-03-15 06:49:40 -07:00
parent 760cf9d6b2
commit 68b9f75dab
2 changed files with 173 additions and 134 deletions
--- a/archivebox/tests/fixtures.py
+++ b/archivebox/tests/fixtures.py
@@ -1,5 +1,7 @@
 import os
 import subprocess
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread

 import pytest

@@ -29,3 +31,83 @@ def disable_extractors_dict():
        "SAVE_FAVICON": "false",
    })
    return env
+
+
+@pytest.fixture
+def recursive_test_site():  # yields URLs of a throwaway local HTTP site: root -> 3 child pages -> 3 deep pages
+    pages = {  # request path -> response body (bytes); do_GET looks paths up by exact match
+        "/": """
+            <html>
+              <head>
+                <title>Root</title>
+                <link rel="icon" href="/favicon.ico">
+              </head>
+              <body>
+                <a href="/about">About</a>
+                <a href="/blog">Blog</a>
+                <a href="/contact">Contact</a>
+              </body>
+            </html>
+        """.strip().encode("utf-8"),  # root links to the three child pages and declares a favicon
+        "/about": """
+            <html>
+              <body>
+                <a href="/deep/about">Deep About</a>
+              </body>
+            </html>
+        """.strip().encode("utf-8"),  # each child page links to exactly one /deep/* page
+        "/blog": """
+            <html>
+              <body>
+                <a href="/deep/blog">Deep Blog</a>
+              </body>
+            </html>
+        """.strip().encode("utf-8"),
+        "/contact": """
+            <html>
+              <body>
+                <a href="/deep/contact">Deep Contact</a>
+              </body>
+            </html>
+        """.strip().encode("utf-8"),
+        "/deep/about": b"<html><body><h1>Deep About</h1></body></html>",  # deep pages contain no links
+        "/deep/blog": b"<html><body><h1>Deep Blog</h1></body></html>",
+        "/deep/contact": b"<html><body><h1>Deep Contact</h1></body></html>",
+        "/favicon.ico": b"test-icon",  # placeholder bytes, not a real icon image
+    }
+
+    class _RecursiveHandler(BaseHTTPRequestHandler):  # serves the in-memory `pages` mapping defined above
+        def do_GET(self):
+            body = pages.get(self.path)  # self.path is used verbatim, so a path with a query string would miss
+            if body is None:
+                self.send_response(404)  # unknown path: 404 with no body
+                self.end_headers()
+                return
+
+            self.send_response(200)
+            if self.path.endswith(".ico"):
+                self.send_header("Content-Type", "image/x-icon")
+            else:
+                self.send_header("Content-Type", "text/html; charset=utf-8")
+            self.send_header("Content-Length", str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+
+        def log_message(self, format, *args):  # no-op override: suppresses the default per-request log output
+            return
+
+    server = ThreadingHTTPServer(("127.0.0.1", 0), _RecursiveHandler)  # port 0: the OS assigns a free port
+    thread = Thread(target=server.serve_forever, daemon=True)  # daemon thread does not block interpreter exit
+    thread.start()
+    try:
+        base_url = f"http://127.0.0.1:{server.server_address[1]}"  # server_address[1] is the port actually bound
+        yield {
+            "base_url": base_url,
+            "root_url": f"{base_url}/",
+            "child_urls": [f"{base_url}/about", f"{base_url}/blog", f"{base_url}/contact"],  # pages linked from the root
+            "deep_urls": [f"{base_url}/deep/about", f"{base_url}/deep/blog", f"{base_url}/deep/contact"],  # linked only from child pages
+        }
+    finally:
+        server.shutdown()  # stops the serve_forever() loop
+        server.server_close()  # releases the listening socket
+        thread.join(timeout=5)  # bounded wait so teardown cannot hang
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -1,17 +1,38 @@
 #!/usr/bin/env python3
 """Integration tests for recursive crawling functionality."""

+import json
 import os
 import subprocess
 import sqlite3
 import time
+from pathlib import Path

 import pytest

-from .fixtures import process, disable_extractors_dict
+from .fixtures import process, disable_extractors_dict, recursive_test_site


-def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
+def wait_for_db_condition(timeout, condition, interval=0.5):  # polls every `interval` s for up to `timeout` s; True once condition(cursor) is truthy, else False
+    deadline = time.monotonic() + timeout  # monotonic clock: wall-clock adjustments cannot stretch or cut the wait
+    while time.monotonic() < deadline:
+        if os.path.exists("index.sqlite3"):  # path is relative to the cwd; the DB may not exist yet
+            conn = sqlite3.connect("index.sqlite3")  # fresh connection on every poll
+            try:
+                if condition(conn.cursor()):
+                    return True
+            finally:
+                conn.close()  # closed even when condition raises
+        time.sleep(interval)
+    return False
+
+
+def stop_process(proc):  # force-stop a subprocess.Popen process and collect whatever output it produced
+    proc.kill()
+    return proc.communicate()  # waits for exit; returns the (stdout, stderr) tuple
+
+
+def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recursive_test_site):
    """Test that background hooks (.bg.) don't block other extractors from running."""
    os.chdir(tmp_path)

@@ -39,23 +60,22 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
        "SAVE_WGET": "true",
    })

-    # Start a crawl with depth=1
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

-    # Give the background hook + parser enough time to create and process the root snapshot.
-    time.sleep(20)
+    assert wait_for_db_condition(
+        timeout=30,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "Parser extractors never progressed beyond queued status"
+    stdout, stderr = stop_process(proc)

-    # Kill the process
-    proc.kill()
-    stdout, stderr = proc.communicate()
-
-    # Debug: print stderr to see what's happening
    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    if stdout:
@@ -64,58 +84,40 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Check if snapshot was created
    snapshots = c.execute("SELECT url, depth, status FROM core_snapshot").fetchall()
-
-    # Check that background hooks are running
-    # Background hooks: consolelog, ssl, responses, redirects, staticfile
    bg_hooks = c.execute(
-        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
+        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
    ).fetchall()
-
-    # Check that parser extractors have run (not stuck in queued)
    parser_extractors = c.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin"
    ).fetchall()
-
-    # Check all extractors to see what's happening
    all_extractors = c.execute(
        "SELECT plugin, status FROM core_archiveresult ORDER BY plugin"
    ).fetchall()

    conn.close()

-    # Should have created at least a snapshot
    assert len(snapshots) > 0, (
        f"Should have created snapshot after Crawl hooks finished. "
        f"If this fails, Crawl hooks may be taking too long. "
        f"Snapshots: {snapshots}"
    )

-    # Should have background hooks (or at least some extractors created)
    assert len(all_extractors) > 0, (
        f"Should have extractors created for snapshot. "
        f"If this fails, Snapshot.run() may not have started. "
        f"Got: {all_extractors}"
    )
-    # Background hooks are optional - test passes even if none are created
-    # Main requirement is that parser extractors run (not blocked by anything)
-    # assert len(bg_hooks) > 0, (
-    #     f"Should have background hooks created with USE_CHROME=true. "
-    #     f"All extractors: {all_extractors}"
-    # )

-    # Parser extractors should not all be queued (at least some should have run)
    parser_statuses = [status for _, status in parser_extractors]
    assert 'started' in parser_statuses or 'succeeded' in parser_statuses or 'failed' in parser_statuses, \
-        f"Parser extractors should have run, got statuses: {parser_statuses}"
+        f"Parser extractors should have run, got statuses: {parser_statuses}. Background hooks: {bg_hooks}"


-def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
+def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test_site):
    """Test that parser extractors emit Snapshot JSONL to stdout."""
    os.chdir(tmp_path)

-    # Enable only parse_html_urls for this test
    env = os.environ.copy()
    env.update({
        "SAVE_WGET": "false",
@@ -135,28 +137,27 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
        "USE_CHROME": "false",
    })

-    # Add a URL with depth=0 (no recursion yet)
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

-    # Give time for extractors to run
-    time.sleep(5)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert wait_for_db_condition(
+        timeout=20,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "parse_html_urls did not run in time"
+    stop_process(proc)

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Check that parse_html_urls ran
    parse_html = c.execute(
-        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin = '60_parse_html_urls'"
+        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1"
    ).fetchone()

    conn.close()
@@ -165,39 +166,32 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
        status = parse_html[1]
        output = parse_html[2] or ""

-        # Parser should have run
        assert status in ['started', 'succeeded', 'failed'], \
            f"60_parse_html_urls should have run, got status: {status}"

-        # If it succeeded and found links, output should contain JSON
        if status == 'succeeded' and output:
-            # Output should be JSONL format (one JSON object per line)
-            # Each line should have {"type": "Snapshot", ...}
-            assert 'Snapshot' in output or output == '', \
-                "Parser output should contain Snapshot JSONL or be empty"
+            assert 'parsed' in output.lower(), "Parser summary should report parsed URLs"
+
+    urls_jsonl_files = list(Path("users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl"))
+    assert urls_jsonl_files, "parse_html_urls should write urls.jsonl output"
+
+    records = []
+    for line in urls_jsonl_files[0].read_text().splitlines():
+        if line.strip():
+            records.append(json.loads(line))
+
+    assert records, "urls.jsonl should contain parsed Snapshot records"
+    assert all(record.get("type") == "Snapshot" for record in records), \
+        f"Expected Snapshot JSONL records, got: {records}"


-def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
+def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_test_site):
    """Test that recursive crawling creates child snapshots with proper depth and parent_snapshot_id."""
    os.chdir(tmp_path)

-    # Create a test HTML file with links
-    test_html = tmp_path / 'test.html'
-    test_html.write_text('''
-    <html>
-    <body>
-        <h1>Test Page</h1>
-        <a href="https://monadical.com/about">About</a>
-        <a href="https://monadical.com/blog">Blog</a>
-        <a href="https://monadical.com/contact">Contact</a>
-    </body>
-    </html>
-    ''')
-
-    # Minimal env for fast testing
    env = os.environ.copy()
    env.update({
-        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
+        "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
        "SAVE_READABILITY": "false",
        "SAVE_SINGLEFILE": "false",
        "SAVE_MERCURY": "false",
@@ -210,24 +204,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
        "SAVE_TITLE": "false",
    })

-    # Start a crawl with depth=1 (just one hop to test recursive crawling)
-    # Use file:// URL so it's instant, no network fetch needed
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

-    # Give orchestrator time to process - file:// is fast, should complete in 20s
-    time.sleep(20)
+    assert wait_for_db_condition(
+        timeout=30,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_snapshot WHERE depth = 1"
+        ).fetchone()[0] >= 3,
+    ), "Recursive crawl never created child snapshots"
+    stdout, stderr = stop_process(proc)

-    # Kill the process
-    proc.kill()
-    stdout, stderr = proc.communicate()
-
-    # Debug: print stderr to see what's happening
    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    if stdout:
@@ -236,31 +228,20 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Check if any snapshots were created
    all_snapshots = c.execute("SELECT url, depth FROM core_snapshot").fetchall()
-
-    # Check root snapshot (depth=0)
    root_snapshot = c.execute(
        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1"
    ).fetchone()
-
-    # Check if any child snapshots were created (depth=1)
    child_snapshots = c.execute(
        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1"
    ).fetchall()
-
-    # Check crawl was created
    crawl = c.execute(
        "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
-
-    # Check parser extractor status
    parser_status = c.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
        (root_snapshot[0] if root_snapshot else '',)
    ).fetchall()
-
-    # Check for started extractors that might be blocking
    started_extractors = c.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
        (root_snapshot[0] if root_snapshot else '',)
@@ -268,61 +249,46 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):

    conn.close()

-    # Verify root snapshot exists
    assert root_snapshot is not None, f"Root snapshot should exist at depth=0. All snapshots: {all_snapshots}"
    root_id = root_snapshot[0]

-    # Verify crawl was created with correct max_depth
    assert crawl is not None, "Crawl should be created"
    assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"

-    # Verify child snapshots were created (monadical.com should have links)
    assert len(child_snapshots) > 0, \
        f"Child snapshots should be created from monadical.com links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"

-    # If children exist, verify they have correct parent_snapshot_id
    for child_id, child_url, child_depth, parent_id in child_snapshots:
        assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
        assert parent_id == root_id, \
            f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"


-def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict):
+def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict, recursive_test_site):
    """Test that recursive crawling stops at max_depth."""
    os.chdir(tmp_path)

-    # Start a crawl with depth=1
-    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+    result = subprocess.run(
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
+        capture_output=True,
        text=True,
        env=disable_extractors_dict,
+        timeout=60,
    )
-
-    # Give orchestrator time to process
-    time.sleep(10)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert result.returncode == 0, result.stderr

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Check that no snapshots exceed depth=1
    max_depth_found = c.execute(
        "SELECT MAX(depth) FROM core_snapshot"
    ).fetchone()[0]
-
-    # Get depth distribution
    depth_counts = c.execute(
        "SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth"
    ).fetchall()

    conn.close()

-    # Should not exceed max_depth=1
    assert max_depth_found is not None, "Should have at least one snapshot"
    assert max_depth_found <= 1, \
        f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
@@ -362,25 +328,25 @@ def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict)
        f"Snapshot table should have depth column. Columns: {column_names}"


-def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict):
+def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict, recursive_test_site):
    """Test that root snapshots are created with depth=0."""
    os.chdir(tmp_path)

-    subprocess.run(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
+    result = subprocess.run(
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
-        timeout=90,
+        timeout=60,
    )
+    assert result.returncode == 0, result.stderr

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Get the first snapshot for this URL
    snapshot = c.execute(
        "SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
-        ('https://monadical.com',)
+        (recursive_test_site['root_url'],)
    ).fetchone()

    conn.close()
@@ -389,14 +355,10 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
    assert snapshot[1] == 0, f"Root snapshot should have depth=0, got {snapshot[1]}"


-def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
+def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process, recursive_test_site):
    """Test that background hooks don't block foreground extractors from running."""
    os.chdir(tmp_path)

-    # This test verifies that background hooks run concurrently with foreground hooks
-    # and don't block parser extractors
-
-    # Start a crawl
    env = os.environ.copy()
    env.update({
        "SAVE_WGET": "true",
@@ -407,43 +369,38 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
    })

    proc = subprocess.Popen(
-        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

-    # Give time for background hooks to start
-    time.sleep(10)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert wait_for_db_condition(
+        timeout=20,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "Parser extractor never started"
+    stop_process(proc)

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()

-    # Get background hooks that are started
-    bg_started = c.execute(
-        "SELECT plugin FROM core_archiveresult WHERE plugin IN ('favicon') AND status = 'started'"
+    bg_results = c.execute(
+        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')"
    ).fetchall()
-
-    # Get parser extractors that should be queued or better
    parser_status = c.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'"
    ).fetchall()

    conn.close()

-    # If background hooks are running, parser extractors should still run
-    # (not permanently stuck in queued status)
-    if len(bg_started) > 0:
+    if len(bg_results) > 0:
        parser_statuses = [status for _, status in parser_status]
-        # At least some parsers should have progressed beyond queued
        non_queued = [s for s in parser_statuses if s != 'queued']
        assert len(non_queued) > 0 or len(parser_status) == 0, \
-            f"With {len(bg_started)} background hooks started, parser extractors should still run. " \
+            f"With {len(bg_results)} background hooks started, parser extractors should still run. " \
            f"Got statuses: {parser_statuses}"