From 68b9f75dab8426358ed946bf3c1483a0cbf2c38d Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@nicksweeting.com>
Date: Sun, 15 Mar 2026 06:49:40 -0700
Subject: [PATCH] Stabilize recursive crawl CI coverage

---
 archivebox/tests/fixtures.py             |  82 +++++++++
 archivebox/tests/test_recursive_crawl.py | 225 +++++++++--------------
 2 files changed, 173 insertions(+), 134 deletions(-)

diff --git a/archivebox/tests/fixtures.py b/archivebox/tests/fixtures.py
index eceb8fa8..b92d1887 100644
--- a/archivebox/tests/fixtures.py
+++ b/archivebox/tests/fixtures.py
@@ -1,5 +1,7 @@
 import os
 import subprocess
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
 
 import pytest
@@ -29,3 +31,83 @@ def disable_extractors_dict():
         "SAVE_FAVICON": "false",
     })
     return env
+
+
+@pytest.fixture
+def recursive_test_site():
+    pages = {
+        "/": """
+            <html>
+            <head>
+            <title>Root</title>
+            <link rel="icon" href="/favicon.ico">
+            </head>
+            <body>
+            <a href="/about">About</a>
+            <a href="/blog">Blog</a>
+            <a href="/contact">Contact</a>
+            </body>
+            </html>
+        """.strip().encode("utf-8"),
+        "/about": """
+            <html>
+            <body>
+            <a href="/deep/about">Deep About</a>
+            </body></html>
+        """.strip().encode("utf-8"),
+        "/blog": """
+            <html>
+            <body>
+            <a href="/deep/blog">Deep Blog</a>
+            </body></html>
+        """.strip().encode("utf-8"),
+        "/contact": """
+            <html>
+            <body>
+            <a href="/deep/contact">Deep Contact</a>
+            </body></html>
+        """.strip().encode("utf-8"),
+        "/deep/about": b"<html><body>Deep About</body></html>",
+        "/deep/blog": b"<html><body>Deep Blog</body></html>",
+        "/deep/contact": b"<html><body>Deep Contact</body></html>",
+        "/favicon.ico": b"test-icon",
+    }
+
+    class _RecursiveHandler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            body = pages.get(self.path)
+            if body is None:
+                self.send_response(404)
+                self.end_headers()
+                return
+
+            self.send_response(200)
+            if self.path.endswith(".ico"):
+                self.send_header("Content-Type", "image/x-icon")
+            else:
+                self.send_header("Content-Type", "text/html; charset=utf-8")
+            self.send_header("Content-Length", str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+
+        def log_message(self, format, *args):
+            return
+
+    server = ThreadingHTTPServer(("127.0.0.1", 0), _RecursiveHandler)
+    thread = Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    try:
+        base_url = f"http://127.0.0.1:{server.server_address[1]}"
+        yield {
+            "base_url": base_url,
+            "root_url": f"{base_url}/",
+            "child_urls": [f"{base_url}/about", f"{base_url}/blog", f"{base_url}/contact"],
+            "deep_urls": [f"{base_url}/deep/about", f"{base_url}/deep/blog", f"{base_url}/deep/contact"],
+        }
+    finally:
+        server.shutdown()
+        server.server_close()
+        thread.join(timeout=5)
diff --git a/archivebox/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py
index c5ab3eb1..29299441 100644
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -1,17 +1,38 @@
 #!/usr/bin/env python3
 """Integration tests for recursive crawling functionality."""
 
+import json
 import os
 import subprocess
 import sqlite3
 import time
+from pathlib import Path
 
 import pytest
 
-from .fixtures import process, disable_extractors_dict
+from .fixtures import process, disable_extractors_dict, recursive_test_site
 
 
-def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
+def wait_for_db_condition(timeout, condition, interval=0.5):
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        if os.path.exists("index.sqlite3"):
+            conn = sqlite3.connect("index.sqlite3")
+            try:
+                if condition(conn.cursor()):
+                    return True
+            finally:
+                conn.close()
+        time.sleep(interval)
+    return False
+
+
+def stop_process(proc):
+    proc.kill()
+    return proc.communicate()
+
+
+def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recursive_test_site):
     """Test that background hooks (.bg.) don't block other extractors from running."""
     os.chdir(tmp_path)
 
@@ -39,23 +60,22 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
         "SAVE_WGET": "true",
     })
 
-    # Start a crawl with depth=1
     proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         text=True,
         env=env,
     )
 
-    # Give the background hook + parser enough time to create and process the root snapshot.
-    time.sleep(20)
+    assert wait_for_db_condition(
+        timeout=30,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "Parser extractors never progressed beyond queued status"
+    stdout, stderr = stop_process(proc)
 
-    # Kill the process
-    proc.kill()
-    stdout, stderr = proc.communicate()
-
-    # Debug: print stderr to see what's happening
     if stderr:
         print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
     if stdout:
@@ -64,58 +84,40 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Check if snapshot was created
     snapshots = c.execute("SELECT url, depth, status FROM core_snapshot").fetchall()
-
-    # Check that background hooks are running
-    # Background hooks: consolelog, ssl, responses, redirects, staticfile
     bg_hooks = c.execute(
-        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
+        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
     ).fetchall()
-
-    # Check that parser extractors have run (not stuck in queued)
     parser_extractors = c.execute(
         "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin"
     ).fetchall()
-
-    # Check all extractors to see what's happening
     all_extractors = c.execute(
         "SELECT plugin, status FROM core_archiveresult ORDER BY plugin"
     ).fetchall()
 
     conn.close()
 
-    # Should have created at least a snapshot
     assert len(snapshots) > 0, (
         f"Should have created snapshot after Crawl hooks finished. "
         f"If this fails, Crawl hooks may be taking too long. "
         f"Snapshots: {snapshots}"
     )
 
-    # Should have background hooks (or at least some extractors created)
     assert len(all_extractors) > 0, (
         f"Should have extractors created for snapshot. "
         f"If this fails, Snapshot.run() may not have started. "
         f"Got: {all_extractors}"
    )
 
-    # Background hooks are optional - test passes even if none are created
-    # Main requirement is that parser extractors run (not blocked by anything)
-    # assert len(bg_hooks) > 0, (
-    #     f"Should have background hooks created with USE_CHROME=true. "
-    #     f"All extractors: {all_extractors}"
-    # )
-
-    # Parser extractors should not all be queued (at least some should have run)
     parser_statuses = [status for _, status in parser_extractors]
     assert 'started' in parser_statuses or 'succeeded' in parser_statuses or 'failed' in parser_statuses, \
-        f"Parser extractors should have run, got statuses: {parser_statuses}"
+        f"Parser extractors should have run, got statuses: {parser_statuses}. Background hooks: {bg_hooks}"
 
 
-def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
+def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test_site):
     """Test that parser extractors emit Snapshot JSONL to stdout."""
     os.chdir(tmp_path)
 
-    # Enable only parse_html_urls for this test
     env = os.environ.copy()
     env.update({
         "SAVE_WGET": "false",
@@ -135,28 +137,27 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
         "USE_CHROME": "false",
     })
 
-    # Add a URL with depth=0 (no recursion yet)
     proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         text=True,
         env=env,
     )
 
-    # Give time for extractors to run
-    time.sleep(5)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert wait_for_db_condition(
+        timeout=20,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "parse_html_urls did not run in time"
+    stop_process(proc)
 
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Check that parse_html_urls ran
     parse_html = c.execute(
-        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin = '60_parse_html_urls'"
+        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1"
     ).fetchone()
 
     conn.close()
@@ -165,39 +166,32 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
     status = parse_html[1]
     output = parse_html[2] or ""
 
-    # Parser should have run
     assert status in ['started', 'succeeded', 'failed'], \
         f"60_parse_html_urls should have run, got status: {status}"
 
-    # If it succeeded and found links, output should contain JSON
     if status == 'succeeded' and output:
-        # Output should be JSONL format (one JSON object per line)
-        # Each line should have {"type": "Snapshot", ...}
-        assert 'Snapshot' in output or output == '', \
-            "Parser output should contain Snapshot JSONL or be empty"
+        assert 'parsed' in output.lower(), "Parser summary should report parsed URLs"
+
+    urls_jsonl_files = list(Path("users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl"))
+    assert urls_jsonl_files, "parse_html_urls should write urls.jsonl output"
+
+    records = []
+    for line in urls_jsonl_files[0].read_text().splitlines():
+        if line.strip():
+            records.append(json.loads(line))
+
+    assert records, "urls.jsonl should contain parsed Snapshot records"
+    assert all(record.get("type") == "Snapshot" for record in records), \
+        f"Expected Snapshot JSONL records, got: {records}"
 
 
-def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
+def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_test_site):
     """Test that recursive crawling creates child snapshots with proper depth and parent_snapshot_id."""
     os.chdir(tmp_path)
 
-    # Create a test HTML file with links
-    test_html = tmp_path / 'test.html'
-    test_html.write_text('''
-    <html>
-    <head>
-    <title>Test Page</title>
-    </head>
-    <body>
-    <a href="https://monadical.com/about">About</a>
-    <a href="https://monadical.com/blog">Blog</a>
-    <a href="https://monadical.com/contact">Contact</a>
-    </body>
-    </html>
-    ''')
-
-    # Minimal env for fast testing
     env = os.environ.copy()
     env.update({
-        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
+        "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
         "SAVE_READABILITY": "false",
         "SAVE_SINGLEFILE": "false",
         "SAVE_MERCURY": "false",
@@ -210,24 +204,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
         "SAVE_TITLE": "false",
     })
 
-    # Start a crawl with depth=1 (just one hop to test recursive crawling)
-    # Use file:// URL so it's instant, no network fetch needed
     proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         text=True,
         env=env,
     )
 
-    # Give orchestrator time to process - file:// is fast, should complete in 20s
-    time.sleep(20)
+    assert wait_for_db_condition(
+        timeout=30,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_snapshot WHERE depth = 1"
+        ).fetchone()[0] >= 3,
+    ), "Recursive crawl never created child snapshots"
+    stdout, stderr = stop_process(proc)
 
-    # Kill the process
-    proc.kill()
-    stdout, stderr = proc.communicate()
-
-    # Debug: print stderr to see what's happening
     if stderr:
         print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
     if stdout:
@@ -236,31 +228,20 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Check if any snapshots were created
     all_snapshots = c.execute("SELECT url, depth FROM core_snapshot").fetchall()
-
-    # Check root snapshot (depth=0)
     root_snapshot = c.execute(
         "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1"
     ).fetchone()
-
-    # Check if any child snapshots were created (depth=1)
     child_snapshots = c.execute(
         "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1"
     ).fetchall()
-
-    # Check crawl was created
     crawl = c.execute(
         "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
     ).fetchone()
-
-    # Check parser extractor status
     parser_status = c.execute(
         "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
         (root_snapshot[0] if root_snapshot else '',)
     ).fetchall()
-
-    # Check for started extractors that might be blocking
     started_extractors = c.execute(
         "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
         (root_snapshot[0] if root_snapshot else '',)
@@ -268,61 +249,46 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
 
     conn.close()
 
-    # Verify root snapshot exists
     assert root_snapshot is not None, f"Root snapshot should exist at depth=0. All snapshots: {all_snapshots}"
     root_id = root_snapshot[0]
 
-    # Verify crawl was created with correct max_depth
     assert crawl is not None, "Crawl should be created"
     assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"
 
-    # Verify child snapshots were created (monadical.com should have links)
     assert len(child_snapshots) > 0, \
         f"Child snapshots should be created from monadical.com links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"
 
-    # If children exist, verify they have correct parent_snapshot_id
     for child_id, child_url, child_depth, parent_id in child_snapshots:
         assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
         assert parent_id == root_id, \
             f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
 
 
-def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict):
+def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict, recursive_test_site):
     """Test that recursive crawling stops at max_depth."""
     os.chdir(tmp_path)
 
-    # Start a crawl with depth=1
-    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
+    result = subprocess.run(
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
+        capture_output=True,
         text=True,
         env=disable_extractors_dict,
+        timeout=60,
     )
-
-    # Give orchestrator time to process
-    time.sleep(10)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert result.returncode == 0, result.stderr
 
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Check that no snapshots exceed depth=1
     max_depth_found = c.execute(
         "SELECT MAX(depth) FROM core_snapshot"
     ).fetchone()[0]
-
-    # Get depth distribution
     depth_counts = c.execute(
         "SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth"
     ).fetchall()
 
     conn.close()
 
-    # Should not exceed max_depth=1
     assert max_depth_found is not None, "Should have at least one snapshot"
     assert max_depth_found <= 1, \
         f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
@@ -362,25 +328,25 @@ def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict)
         f"Snapshot table should have depth column. Columns: {column_names}"
 
 
-def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict):
+def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict, recursive_test_site):
     """Test that root snapshots are created with depth=0."""
     os.chdir(tmp_path)
 
-    subprocess.run(
-        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
+    result = subprocess.run(
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
         capture_output=True,
         text=True,
         env=disable_extractors_dict,
-        timeout=90,
+        timeout=60,
    )
+    assert result.returncode == 0, result.stderr
 
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Get the first snapshot for this URL
     snapshot = c.execute(
         "SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
-        ('https://monadical.com',)
+        (recursive_test_site['root_url'],)
     ).fetchone()
 
     conn.close()
@@ -389,14 +355,10 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
     assert snapshot[1] == 0, f"Root snapshot should have depth=0, got {snapshot[1]}"
 
 
-def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
+def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process, recursive_test_site):
     """Test that background hooks don't block foreground extractors from running."""
     os.chdir(tmp_path)
 
-    # This test verifies that background hooks run concurrently with foreground hooks
-    # and don't block parser extractors
-
-    # Start a crawl
     env = os.environ.copy()
     env.update({
         "SAVE_WGET": "true",
@@ -407,43 +369,38 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
     })
 
     proc = subprocess.Popen(
-        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
+        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         text=True,
         env=env,
     )
 
-    # Give time for background hooks to start
-    time.sleep(10)
-
-    # Kill the process
-    proc.kill()
-    proc.wait()
+    assert wait_for_db_condition(
+        timeout=20,
+        condition=lambda c: c.execute(
+            "SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
+        ).fetchone()[0] > 0,
+    ), "Parser extractor never started"
+    stop_process(proc)
 
     conn = sqlite3.connect('index.sqlite3')
     c = conn.cursor()
 
-    # Get background hooks that are started
-    bg_started = c.execute(
-        "SELECT plugin FROM core_archiveresult WHERE plugin IN ('favicon') AND status = 'started'"
+    bg_results = c.execute(
+        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')"
     ).fetchall()
-
-    # Get parser extractors that should be queued or better
     parser_status = c.execute(
         "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'"
     ).fetchall()
 
     conn.close()
 
-    # If background hooks are running, parser extractors should still run
-    # (not permanently stuck in queued status)
-    if len(bg_started) > 0:
+    if len(bg_results) > 0:
         parser_statuses = [status for _, status in parser_status]
-        # At least some parsers should have progressed beyond queued
         non_queued = [s for s in parser_statuses if s != 'queued']
         assert len(non_queued) > 0 or len(parser_status) == 0, \
-            f"With {len(bg_started)} background hooks started, parser extractors should still run. " \
+            f"With {len(bg_results)} background hooks started, parser extractors should still run. " \
             f"Got statuses: {parser_statuses}"