fix lint

2026-04-06 07:47:53 +10:00 · 2026-03-15 18:45:29 -07:00
parent f97725d16f
commit 934e02695b
111 changed files with 919 additions and 461 deletions
--- a/archivebox/tests/conftest.py
+++ b/archivebox/tests/conftest.py
@@ -1,7 +1,6 @@
 """archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""

 import os
-import shutil
 import sys
 import subprocess
 import textwrap
@@ -13,6 +12,8 @@ import pytest

 from archivebox.uuid_compat import uuid7

+pytest_plugins = ["archivebox.tests.fixtures"]
+

 # =============================================================================
 # CLI Helpers (defined before fixtures that use them)
--- a/archivebox/tests/test_add.py
+++ b/archivebox/tests/test_add.py
@@ -1,9 +1,6 @@
-import subprocess
-import json
-import sqlite3
 import os
-
-from .fixtures import *
+import sqlite3
+import subprocess

 def test_depth_flag_is_accepted(process, disable_extractors_dict):
    arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):

 def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
-    arg_process = subprocess.run(
+    subprocess.run(
        ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
        capture_output=True,
        env=disable_extractors_dict,
--- a/archivebox/tests/test_admin_views.py
+++ b/archivebox/tests/test_admin_views.py
@@ -9,7 +9,7 @@ Tests cover:
 """

 import pytest
-from django.test import TestCase, Client, override_settings
+from django.test import override_settings
 from django.urls import reverse
 from django.contrib.auth import get_user_model

--- a/archivebox/tests/test_auth_ldap.py
+++ b/archivebox/tests/test_auth_ldap.py
@@ -9,7 +9,7 @@ import os
 import sys
 import tempfile
 import unittest
-from pathlib import Path
+from importlib.util import find_spec


 class TestLDAPConfig(unittest.TestCase):
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):

    def test_django_settings_with_ldap_library_check(self):
        """Test that Django settings check for LDAP libraries when enabled."""
-        # Try to import django-auth-ldap to see if it's available
-        try:
-            import django_auth_ldap
-            import ldap
-            ldap_available = True
-        except ImportError:
-            ldap_available = False
+        ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None

        # If LDAP libraries are not available, settings should handle gracefully
        if not ldap_available:
--- a/archivebox/tests/test_cli_add.py
+++ b/archivebox/tests/test_cli_add.py
@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
 """

 import os
-import subprocess
 import sqlite3
-from pathlib import Path
-
-from .fixtures import *
+import subprocess


 def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    assert 'test' in tags_str or 'example' in tags_str


+def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
+    """Test add persists the selected persona so browser config derives from it later."""
+    os.chdir(tmp_path)
+    result = subprocess.run(
+        ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
+        capture_output=True,
+        env=disable_extractors_dict,
+    )
+
+    # stderr is bytes here (no text=True); decode it so a failure shows the CLI's own error.
+    assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace")
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        row = conn.execute(
+            "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
+        ).fetchone()
+    finally:
+        # Close the handle even when the query itself raises.
+        conn.close()
+
+    # fetchone() returns None on an empty table; fail clearly instead of a TypeError on unpacking.
+    assert row is not None, "archivebox add did not create any crawls_crawl row"
+    persona_id, default_persona = row
+    assert persona_id
+    assert default_persona == 'Default'
+    assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
+    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
+
+
 def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice creates separate crawls and snapshots.

--- a/archivebox/tests/test_cli_archiveresult.py
+++ b/archivebox/tests/test_cli_archiveresult.py
@@ -9,7 +9,6 @@ Tests cover:
 """

 import json
-import pytest

 from archivebox.tests.conftest import (
    run_archivebox_cmd,
--- a/archivebox/tests/test_cli_config.py
+++ b/archivebox/tests/test_cli_config.py
@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.

 import os
 import subprocess
-from pathlib import Path
-
-from .fixtures import *


 def test_config_displays_all_config(tmp_path, process):
--- a/archivebox/tests/test_cli_crawl.py
+++ b/archivebox/tests/test_cli_crawl.py
@@ -9,14 +9,11 @@ Tests cover:
 """

 import json
-import pytest

 from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
-    assert_jsonl_contains_type,
    create_test_url,
-    create_test_crawl_json,
 )


--- a/archivebox/tests/test_cli_extract.py
+++ b/archivebox/tests/test_cli_extract.py
@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
 """

 import os
-import subprocess
 import sqlite3
-
-from .fixtures import *
+import subprocess


 def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_cli_help.py
+++ b/archivebox/tests/test_cli_help.py
@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
 import os
 import subprocess

-from .fixtures import *
-

 def test_help_runs_successfully(tmp_path):
    """Test that help command runs and produces output."""
--- a/archivebox/tests/test_cli_init.py
+++ b/archivebox/tests/test_cli_init.py
@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
 """

 import os
-import subprocess
 import sqlite3
-from pathlib import Path
+import subprocess

 from archivebox.config.common import STORAGE_CONFIG

-from .fixtures import *
-

 DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')

--- a/archivebox/tests/test_cli_install.py
+++ b/archivebox/tests/test_cli_install.py
@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
 """

 import os
-import subprocess
 import sqlite3
+import subprocess
 from pathlib import Path

-from .fixtures import *
-

 def test_install_runs_successfully(tmp_path, process):
    """Test that install command runs without error."""
--- a/archivebox/tests/test_cli_manage.py
+++ b/archivebox/tests/test_cli_manage.py
@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.

 import os
 import subprocess
-import sqlite3
-
-from .fixtures import *


 def test_manage_help_works(tmp_path, process):
--- a/archivebox/tests/test_cli_remove.py
+++ b/archivebox/tests/test_cli_remove.py
@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
 """

 import os
-import subprocess
 import sqlite3
-from pathlib import Path
-
-from .fixtures import *
+import subprocess


 def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_cli_run.py
+++ b/archivebox/tests/test_cli_run.py
@@ -8,7 +8,6 @@ Tests cover:
 """

 import json
-import pytest

 from archivebox.tests.conftest import (
    run_archivebox_cmd,
--- a/archivebox/tests/test_cli_run_binary_worker.py
+++ b/archivebox/tests/test_cli_run_binary_worker.py
@@ -10,11 +10,9 @@ Tests cover:

 import json
 import sqlite3
-import time

 from archivebox.tests.conftest import (
    run_archivebox_cmd,
-    parse_jsonl_output,
 )


--- a/archivebox/tests/test_cli_schedule.py
+++ b/archivebox/tests/test_cli_schedule.py
@@ -5,7 +5,6 @@ import os
 import sqlite3
 import subprocess

-from .fixtures import process, disable_extractors_dict


 def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_cli_search.py
+++ b/archivebox/tests/test_cli_search.py
@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.

 import os
 import subprocess
-import sqlite3
-
-from .fixtures import *


 def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_cli_server.py
+++ b/archivebox/tests/test_cli_server.py
@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).

 import os
 import subprocess
-import signal
-import time
-
-from .fixtures import *


 def test_server_shows_usage_info(tmp_path, process):
--- a/archivebox/tests/test_cli_shell.py
+++ b/archivebox/tests/test_cli_shell.py
@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
 import os
 import subprocess

-from .fixtures import *
-

 def test_shell_command_exists(tmp_path, process):
    """Test that shell command is recognized."""
--- a/archivebox/tests/test_cli_snapshot.py
+++ b/archivebox/tests/test_cli_snapshot.py
@@ -9,12 +9,10 @@ Tests cover:
 """

 import json
-import pytest

 from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
-    assert_jsonl_contains_type,
    create_test_url,
 )

--- a/archivebox/tests/test_cli_status.py
+++ b/archivebox/tests/test_cli_status.py
@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
 """

 import os
-import subprocess
 import sqlite3
+import subprocess
 from pathlib import Path

-from .fixtures import *
-

 def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
    candidates = {snapshot_id}
--- a/archivebox/tests/test_cli_update.py
+++ b/archivebox/tests/test_cli_update.py
@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
 """

 import os
-import subprocess
 import sqlite3
-
-from .fixtures import *
+import subprocess


 def test_update_runs_successfully_on_empty_archive(tmp_path, process):
--- a/archivebox/tests/test_cli_version.py
+++ b/archivebox/tests/test_cli_version.py
@@ -11,7 +11,9 @@ import tempfile
 import subprocess
 from pathlib import Path

-from .fixtures import *
+from .fixtures import process
+
+FIXTURES = (process,)


 def _archivebox_cli() -> str:
--- a/archivebox/tests/test_config.py
+++ b/archivebox/tests/test_config.py
@@ -6,7 +6,6 @@ import subprocess

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_config_shows_all_config_values(tmp_path, process):
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
        capture_output=True,
        text=True,
    )
+    assert result.returncode == 0, result.stderr

    # Read the config file directly to verify it was written
    config_file = tmp_path / 'ArchiveBox.conf'
--- a/archivebox/tests/test_crawl.py
+++ b/archivebox/tests/test_crawl.py
@@ -4,11 +4,9 @@
 import os
 import subprocess
 import sqlite3
-import json

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_extract.py
+++ b/archivebox/tests/test_extract.py
@@ -8,7 +8,6 @@ import json

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
        text=True,
        env=disable_extractors_dict,
    )
+    assert result.returncode == 0, result.stderr

    # Should not error
    conn = sqlite3.connect('index.sqlite3')
--- a/archivebox/tests/test_extractors.py
+++ b/archivebox/tests/test_extractors.py
@@ -1,8 +1,12 @@
-from .fixtures import *
 import json as pyjson
 import sqlite3
+import subprocess
 from pathlib import Path

+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)
+

 def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
    candidates = {snapshot_id}
--- a/archivebox/tests/test_hooks.py
+++ b/archivebox/tests/test_hooks.py
@@ -16,7 +16,7 @@ import subprocess
 import tempfile
 import unittest
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch

 # Set up Django before importing any Django-dependent modules
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
--- a/archivebox/tests/test_init.py
+++ b/archivebox/tests/test_init.py
@@ -3,13 +3,13 @@

 import os
 import subprocess
-from pathlib import Path
-import json, shutil
 import sqlite3

 from archivebox.config.common import STORAGE_CONFIG

-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)

 DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')

@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
    add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
                                  capture_output=True, env=disable_extractors_dict)
+    assert add_process.returncode == 0, add_process.stderr.decode("utf-8")

    # In the new architecture, URLs are saved to source files
    # Check that a source file was created with the URL
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
    add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
                                  capture_output=True, env=disable_extractors_dict)
+    assert add_process.returncode == 0, add_process.stderr.decode("utf-8")

    # Check that a source file was created with both URLs
    sources_dir = tmp_path / "sources"
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
    os.chdir(tmp_path)
    add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
                                  env=disable_extractors_dict)
+    assert add_process.returncode == 0, add_process.stderr.decode("utf-8")

    # Check database permissions
    assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
--- a/archivebox/tests/test_install.py
+++ b/archivebox/tests/test_install.py
@@ -7,7 +7,6 @@ import sqlite3

 import pytest

-from .fixtures import process, disable_extractors_dict


 class TestInstallDryRun:
--- a/archivebox/tests/test_list.py
+++ b/archivebox/tests/test_list.py
@@ -1,7 +1,9 @@
 import json
 import subprocess

-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)

 def test_search_json(process, disable_extractors_dict):
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
 - New fields like depth, retry_at, etc.
 """

-import json
 import shutil
 import sqlite3
-import subprocess
 import tempfile
 import unittest
 from pathlib import Path
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
                        f"Files lost during migration: {files_before_count} -> {files_after_count}")

        # Run update to trigger filesystem reorganization
-        print(f"\n[*] Running archivebox update to reorganize filesystem...")
+        print("\n[*] Running archivebox update to reorganize filesystem...")
        result = run_archivebox(self.work_dir, ['update'], timeout=120)
        self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")

@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):

        # CRITICAL: Verify sample files exist in new structure
        self.assertGreater(len(new_sample_files), 0,
-                          f"Sample files not found in new structure")
+                          "Sample files not found in new structure")

        # Verify new path format
        for path_key, file_path in new_sample_files.items():
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -10,7 +10,6 @@ from pathlib import Path

 import pytest

-from .fixtures import process, disable_extractors_dict, recursive_test_site


 def wait_for_db_condition(timeout, condition, interval=0.5):
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
        "SAVE_FAVICON": "true",
-        "SAVE_WGET": "false",
    })

    proc = subprocess.Popen(
--- a/archivebox/tests/test_remove.py
+++ b/archivebox/tests/test_remove.py
@@ -1,7 +1,10 @@
 import os
 import sqlite3
+import subprocess

-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)

 def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
    """Test removing a snapshot by URL pattern"""
--- a/archivebox/tests/test_schedule.py
+++ b/archivebox/tests/test_schedule.py
@@ -7,7 +7,6 @@ import subprocess

 import pytest

-from .fixtures import process


 def _fetchone(tmp_path, query):
--- a/archivebox/tests/test_schedule_e2e.py
+++ b/archivebox/tests/test_schedule_e2e.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
+
+import os
+import socket
+import sqlite3
+import subprocess
+import sys
+import textwrap
+import time
+from pathlib import Path
+
+import pytest
+import requests
+
+from .conftest import run_python_cwd
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def init_archive(cwd: Path) -> None:
+    """Run ``archivebox init --quick`` with *cwd* as working directory and assert it exits 0."""
+    init_cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
+    completed = subprocess.run(init_cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
+    # Attach the captured stderr so a failed init explains itself in the test report.
+    assert completed.returncode == 0, completed.stderr
+
+
+def build_test_env(port: int, **extra: str) -> dict[str, str]:
+    """Build the environment mapping passed to archivebox subprocesses in these tests.
+
+    Starts from the current process environment without DATA_DIR, sets the
+    host/port-dependent and output-related settings, turns the listed SAVE_*
+    toggles to 'False' while leaving SAVE_WGET at 'True', and finally applies
+    *extra* so callers can override any of the defaults.
+    """
+    # Toggles switched off for these tests, in the order they are written to the env.
+    disabled_toggles = (
+        'SAVE_ARCHIVEDOTORG', 'SAVE_TITLE', 'SAVE_FAVICON', 'SAVE_WARC',
+        'SAVE_PDF', 'SAVE_SCREENSHOT', 'SAVE_DOM', 'SAVE_SINGLEFILE',
+        'SAVE_READABILITY', 'SAVE_MERCURY', 'SAVE_GIT', 'SAVE_YTDLP',
+        'SAVE_HEADERS', 'SAVE_HTMLTOTEXT',
+    )
+    merged = {name: value for name, value in os.environ.items() if name != 'DATA_DIR'}
+    merged.update({
+        'LISTEN_HOST': f'archivebox.localhost:{port}',
+        'ALLOWED_HOSTS': '*',
+        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
+        'PUBLIC_ADD_VIEW': 'True',
+        'USE_COLOR': 'False',
+        'SHOW_PROGRESS': 'False',
+        'TIMEOUT': '20',
+        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
+        **dict.fromkeys(disabled_toggles, 'False'),
+        'SAVE_WGET': 'True',
+        'USE_CHROME': 'False',
+    })
+    merged.update(extra)
+    return merged
+
+
+def get_free_port() -> int:
+    """Bind a throwaway TCP socket to port 0 on 127.0.0.1 and return the port the OS assigned.
+
+    The socket is closed before returning, so the port is only known to have been free at call time.
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with probe:
+        probe.bind(('127.0.0.1', 0))
+        _host, assigned_port = probe.getsockname()
+        return assigned_port
+
+
+def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
+    """Run ``archivebox server --daemonize 127.0.0.1:<port>`` in *cwd* with *env* and assert it exits 0."""
+    server_cmd = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}']
+    launched = subprocess.run(server_cmd, cwd=cwd, env=env, capture_output=True, text=True, timeout=60)
+    # Attach the captured stderr so a failed launch explains itself in the test report.
+    assert launched.returncode == 0, launched.stderr
+
+
+def stop_server(cwd: Path) -> None:
+    """Hand run_python_cwd() a script that calls stop_existing_supervisord_process() for *cwd*.
+
+    The helper's result is not inspected, so a failed shutdown is not reported to the caller.
+    """
+    shutdown_snippet = textwrap.dedent(
+        """
+        import os
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
+        stop_existing_supervisord_process()
+        print('stopped')
+        """
+    )
+    run_python_cwd(shutdown_snippet, cwd=cwd, timeout=30)
+
+
+def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
+    """Poll http://127.0.0.1:<port><path> until it answers with a non-5xx status.
+
+    Args:
+        port: TCP port on 127.0.0.1 to connect to.
+        host: Value sent as the ``Host`` header of every request.
+        path: URL path to request.
+        timeout: Overall budget in seconds; each attempt is capped at 2 seconds.
+
+    Returns:
+        The first response whose status code is below 500 (redirects are not followed).
+
+    Raises:
+        AssertionError: If no such response arrives in time. The message carries the
+            last request error or 5xx status that was seen.
+    """
+    # monotonic() cannot jump when the wall clock is adjusted, unlike time.time().
+    deadline = time.monotonic() + timeout
+    last_failure = None
+    while time.monotonic() < deadline:
+        try:
+            response = requests.get(
+                f'http://127.0.0.1:{port}{path}',
+                headers={'Host': host},
+                timeout=2,
+                allow_redirects=False,
+            )
+        except requests.RequestException as exc:
+            last_failure = exc
+        else:
+            if response.status_code < 500:
+                return response
+            # Remember server errors too, so the timeout message is never just "None".
+            last_failure = f'HTTP {response.status_code}'
+        time.sleep(0.5)
+    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_failure}')
+
+
+def make_latest_schedule_due(cwd: Path) -> None:
+    """Backdate the template crawl of the most recently created schedule by two days.
+
+    Sets created_at and modified_at to two days ago on the crawls_crawl row
+    referenced by the newest crawls_crawlschedule row in ``cwd/index.sqlite3``.
+    """
+    backdate_sql = """
+            UPDATE crawls_crawl
+            SET created_at = datetime('now', '-2 day'),
+                modified_at = datetime('now', '-2 day')
+            WHERE id = (
+                SELECT template_id
+                FROM crawls_crawlschedule
+                ORDER BY created_at DESC
+                LIMIT 1
+            )
+            """
+    db = sqlite3.connect(cwd / 'index.sqlite3')
+    try:
+        with db:  # the connection context manager commits when the block succeeds
+            db.execute(backdate_sql)
+    finally:
+        db.close()
+
+
+def get_snapshot_file_text(cwd: Path, url: str) -> str:
+    """Return the text of one captured HTML/text file for the newest snapshot of *url*.
+
+    A helper script is handed to run_python_cwd(). It requires the snapshot to
+    exist with status 'sealed', prefers wget/trafilatura/defuddle outputs, and
+    otherwise falls back to any .html/.htm/.txt file in the snapshot directory
+    that is not under ``responses/`` and is not a stdout/stderr/cmd log.
+
+    Raises:
+        AssertionError: If the helper exits non-zero; the message is its stderr.
+    """
+    probe_source = textwrap.dedent(
+        f"""
+        import os
+        from pathlib import Path
+
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+
+        from archivebox.core.models import Snapshot
+
+        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
+        assert snapshot is not None, 'missing snapshot'
+        assert snapshot.status == 'sealed', snapshot.status
+
+        snapshot_dir = Path(snapshot.output_dir)
+        candidates = []
+        preferred_patterns = (
+            'wget/**/index.html',
+            'wget/**/*.html',
+            'trafilatura/content.html',
+            'trafilatura/content.txt',
+            'defuddle/content.html',
+            'defuddle/content.txt',
+        )
+        for pattern in preferred_patterns:
+            for candidate in snapshot_dir.glob(pattern):
+                if candidate.is_file():
+                    candidates.append(candidate)
+
+        if not candidates:
+            for candidate in snapshot_dir.rglob('*'):
+                if not candidate.is_file():
+                    continue
+                rel = candidate.relative_to(snapshot_dir)
+                if rel.parts and rel.parts[0] == 'responses':
+                    continue
+                if candidate.suffix not in ('.html', '.htm', '.txt'):
+                    continue
+                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
+                    continue
+                candidates.append(candidate)
+
+        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
+        print(candidates[0].read_text(errors='ignore'))
+        """
+    )
+    captured_out, captured_err, exit_code = run_python_cwd(probe_source, cwd=cwd, timeout=60)
+    # Any failed assertion inside the helper surfaces here with the helper's stderr.
+    assert exit_code == 0, captured_err
+    return captured_out
+
+
+def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
+    """Retry get_snapshot_file_text() until it succeeds and return the captured text.
+
+    Args:
+        cwd: Archive data directory to inspect.
+        url: Snapshot URL whose captured content is wanted.
+        timeout: Overall budget in seconds; attempts are spaced 2 seconds apart.
+
+    Raises:
+        AssertionError: If every attempt within the budget failed; the message
+            includes the last failure.
+    """
+    # monotonic() cannot jump when the wall clock is adjusted, unlike time.time().
+    deadline = time.monotonic() + timeout
+    last_error = None
+    while time.monotonic() < deadline:
+        try:
+            return get_snapshot_file_text(cwd, url)
+        except AssertionError as err:
+            last_error = err
+        time.sleep(2)
+    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
+
+
+def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
+    """Count rows in ``cwd/index.sqlite3`` relevant to the one-shot vs. scheduled comparison.
+
+    Returns:
+        A tuple of (core_snapshot rows for *scheduled_url*, core_snapshot rows for
+        *one_shot_url*, crawls_crawl rows with a schedule_id whose urls equal
+        *scheduled_url*).
+    """
+    snapshot_count_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
+    scheduled_crawl_sql = """
+            SELECT COUNT(*)
+            FROM crawls_crawl
+            WHERE schedule_id IS NOT NULL
+              AND urls = ?
+            """
+    db = sqlite3.connect(cwd / 'index.sqlite3')
+    try:
+        def scalar(sql: str, url: str) -> int:
+            # Each query is a single COUNT(*), so the first column of the only row is the answer.
+            return db.execute(sql, (url,)).fetchone()[0]
+
+        return (
+            scalar(snapshot_count_sql, scheduled_url),
+            scalar(snapshot_count_sql, one_shot_url),
+            scalar(scheduled_crawl_sql, scheduled_url),
+        )
+    finally:
+        db.close()
+
+
+def create_admin_and_token(cwd: Path) -> str:
+    """Create (or update) the 'apitestadmin' superuser and return a new API token for it.
+
+    A helper script is handed to run_python_cwd(); the token is taken from the
+    last line the helper printed to stdout.
+
+    Raises:
+        AssertionError: If the helper exits non-zero; the message is its stderr.
+    """
+    provisioning_source = textwrap.dedent(
+        """
+        import os
+        from datetime import timedelta
+        from django.utils import timezone
+
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        import django
+        django.setup()
+
+        from django.contrib.auth import get_user_model
+        from archivebox.api.models import APIToken
+
+        User = get_user_model()
+        user, _ = User.objects.get_or_create(
+            username='apitestadmin',
+            defaults={
+                'email': 'apitestadmin@example.com',
+                'is_staff': True,
+                'is_superuser': True,
+            },
+        )
+        user.is_staff = True
+        user.is_superuser = True
+        user.set_password('testpass123')
+        user.save()
+
+        token = APIToken.objects.create(
+            created_by=user,
+            expires=timezone.now() + timedelta(days=1),
+        )
+        print(token.token)
+        """
+    )
+    captured_out, captured_err, exit_code = run_python_cwd(provisioning_source, cwd=cwd, timeout=60)
+    assert exit_code == 0, captured_err
+    # Only the final stdout line is the token; anything printed before it is ignored.
+    printed_lines = captured_out.strip().splitlines()
+    return printed_lines[-1]
+
+
+@pytest.mark.timeout(300)
+def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
+    """A schedule created via the CLI and made due is executed by a running server.
+
+    The pytest-timeout budget is deliberately larger than the 180s capture wait
+    plus init/schedule/server startup. That way a slow capture fails with the
+    descriptive AssertionError from wait_for_snapshot_capture() instead of being
+    cut off by the marker first.
+    """
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+
+    schedule_result = subprocess.run(
+        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
+        cwd=tmp_path,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=60,
+    )
+    assert schedule_result.returncode == 0, schedule_result.stderr
+    assert 'Created scheduled crawl' in schedule_result.stdout
+
+    # Backdate the template crawl so the schedule is already due when the server starts.
+    make_latest_schedule_due(tmp_path)
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
+        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
+        assert 'Root' in captured_text
+        assert 'About' in captured_text
+    finally:
+        # Always shut the daemonized server down, even when an assertion above failed.
+        stop_server(tmp_path)
+
+
+@pytest.mark.timeout(180)
+def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
+    """A plain ``archivebox add`` archives its own URL and creates no snapshot for a due schedule."""
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+    scheduled_url = recursive_test_site['root_url']
+    one_shot_url = recursive_test_site['child_urls'][0]
+
+    # Shared pieces of the two CLI invocations below.
+    archivebox = [sys.executable, '-m', 'archivebox']
+    run_opts = dict(cwd=tmp_path, capture_output=True, text=True, env=env)
+
+    scheduled = subprocess.run(
+        [*archivebox, 'schedule', '--every=daily', '--depth=0', scheduled_url],
+        timeout=60,
+        **run_opts,
+    )
+    assert scheduled.returncode == 0, scheduled.stderr
+
+    make_latest_schedule_due(tmp_path)
+
+    added = subprocess.run(
+        [*archivebox, 'add', '--depth=0', '--plugins=wget', one_shot_url],
+        timeout=120,
+        **run_opts,
+    )
+    assert added.returncode == 0, added.stderr
+    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
+    assert 'Deep About' in captured_text or 'About' in captured_text
+
+    counts = get_counts(tmp_path, scheduled_url, one_shot_url)
+    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = counts
+    assert one_shot_snapshots >= 1
+    assert scheduled_snapshots == 0
+    assert scheduled_crawls == 1  # template only, no materialized scheduled run
+
+
+@pytest.mark.timeout(180)
+def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
+    """POSTing to /api/v1/cli/schedule on a live server reports exactly one created schedule."""
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port)
+    api_token = create_admin_and_token(tmp_path)
+    api_host = f'api.archivebox.localhost:{port}'
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=api_host, path='/api/v1/docs')
+
+        request_headers = {
+            'Host': api_host,
+            'X-ArchiveBox-API-Key': api_token,
+        }
+        request_body = {
+            'every': 'daily',
+            'import_path': recursive_test_site['root_url'],
+            'quiet': True,
+        }
+        response = requests.post(
+            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
+            headers=request_headers,
+            json=request_body,
+            timeout=10,
+        )
+
+        assert response.status_code == 200, response.text
+        payload = response.json()
+        assert payload['success'] is True
+        assert payload['result_format'] == 'json'
+        assert len(payload['result']['created_schedule_ids']) == 1
+    finally:
+        # Always shut the daemonized server down, even when an assertion above failed.
+        stop_server(tmp_path)
+
+
+@pytest.mark.timeout(180)
+def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
+    """Submitting the /add/ form with a schedule stores a crawl schedule linked to a crawl."""
+    os.chdir(tmp_path)
+    init_archive(tmp_path)
+
+    port = get_free_port()
+    env = build_test_env(port, PUBLIC_ADD_VIEW='True')
+    web_host = f'web.archivebox.localhost:{port}'
+    latest_schedule_sql = """
+                SELECT cs.schedule, c.urls, c.tags_str
+                FROM crawls_crawlschedule cs
+                JOIN crawls_crawl c ON c.schedule_id = cs.id
+                ORDER BY cs.created_at DESC
+                LIMIT 1
+                """
+
+    try:
+        start_server(tmp_path, env=env, port=port)
+        wait_for_http(port, host=web_host, path='/add/')
+
+        form_fields = {
+            'url': recursive_test_site['root_url'],
+            'depth': '0',
+            'schedule': 'daily',
+            'tag': 'web-ui',
+            'notes': 'created from web ui',
+        }
+        response = requests.post(
+            f'http://127.0.0.1:{port}/add/',
+            headers={'Host': web_host},
+            data=form_fields,
+            timeout=10,
+            allow_redirects=False,
+        )
+
+        # The form is expected to answer with a redirect, which is deliberately not followed.
+        assert response.status_code in (302, 303), response.text
+
+        db = sqlite3.connect(tmp_path / 'index.sqlite3')
+        try:
+            newest = db.execute(latest_schedule_sql).fetchone()
+        finally:
+            db.close()
+
+        assert newest == ('daily', recursive_test_site['root_url'], 'web-ui')
+    finally:
+        # Always shut the daemonized server down, even when an assertion above failed.
+        stop_server(tmp_path)
--- a/archivebox/tests/test_search.py
+++ b/archivebox/tests/test_search.py
@@ -3,12 +3,9 @@

 import os
 import subprocess
-import sqlite3
-import json

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
--- a/archivebox/tests/test_snapshot.py
+++ b/archivebox/tests/test_snapshot.py
@@ -6,13 +6,11 @@ import subprocess
 import sqlite3
 from archivebox.machine.models import Process
 from datetime import datetime
-from pathlib import Path
 from urllib.parse import urlparse
 import uuid

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e

    snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
    snapshot_id = str(uuid.UUID(snapshot_id_raw))
-    crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
    username = user_row[0]
-    crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
    snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
    domain = urlparse(snapshot_url).hostname or 'unknown'

--- a/archivebox/tests/test_status.py
+++ b/archivebox/tests/test_status.py
@@ -3,11 +3,9 @@

 import os
 import subprocess
-import sqlite3

 import pytest

-from .fixtures import process, disable_extractors_dict


 def test_status_shows_index_info(tmp_path, process):
--- a/archivebox/tests/test_title.py
+++ b/archivebox/tests/test_title.py
@@ -1,7 +1,10 @@
 import os
 import sqlite3
+import subprocess

-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)  # references the imported fixtures; presumably so linters do not flag the import as unused -- confirm

 def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
    """Test that title is extracted from the page."""
--- a/archivebox/tests/test_update.py
+++ b/archivebox/tests/test_update.py
@@ -1,7 +1,10 @@
 import json
 import sqlite3
+import subprocess

-from .fixtures import *
+from .fixtures import disable_extractors_dict, process
+
+FIXTURES = (disable_extractors_dict, process)  # references the imported fixtures; presumably so linters do not flag the import as unused -- confirm

 def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
    """Test that archivebox update imports real legacy archive directories."""
--- a/archivebox/tests/test_version.py
+++ b/archivebox/tests/test_version.py
@@ -3,11 +3,9 @@

 import os
 import subprocess
-import json

 import pytest

-from .fixtures import process, disable_extractors_dict


 class TestVersionQuiet:
--- a/archivebox/tests/test_worker_config_propagation.py
+++ b/archivebox/tests/test_worker_config_propagation.py
@@ -18,11 +18,9 @@ Config priority order (highest to lowest):
 """

 import os
-import json
 import sys
 import tempfile
 import subprocess
-import time
 from pathlib import Path


@@ -45,7 +43,7 @@ def test_config_propagation_through_worker_hierarchy():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: Config Propagation Through Worker Hierarchy")
+        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

@@ -63,7 +61,7 @@ def test_config_propagation_through_worker_hierarchy():
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
-        print(f"✓ Archive initialized\n")
+        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
@@ -90,7 +88,7 @@ SAVE_TITLE = True
 SAVE_FAVICON = True
 SAVE_SCREENSHOT = True
 """)
-        print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
+        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
@@ -123,7 +121,7 @@ print(f"Machine {{machine.hostname}} config updated")
            timeout=30,
        )
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
-        print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
+        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
@@ -421,7 +419,7 @@ def test_config_environment_variable_parsing():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: Config Environment Variable Parsing")
+        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

@@ -557,7 +555,7 @@ def test_parent_environment_preserved_in_hooks():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: Parent Environment Preserved in Hooks")
+        print("Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

@@ -575,7 +573,7 @@ def test_parent_environment_preserved_in_hooks():
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
-        print(f"✓ Archive initialized\n")
+        print("✓ Archive initialized\n")

        # Create snapshot
        print("Step 2: Create Snapshot")
@@ -635,7 +633,6 @@ print(snapshot.id)
            timeout=120,
        )

-        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
@@ -760,7 +757,7 @@ def test_config_auto_fetch_relationships():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: Config Auto-Fetch Relationships")
+        print("Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

@@ -778,7 +775,7 @@ def test_config_auto_fetch_relationships():
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
-        print(f"✓ Archive initialized\n")
+        print("✓ Archive initialized\n")

        # Create objects with config at each level
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
@@ -906,7 +903,7 @@ def test_config_precedence_with_environment_vars():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: Config Precedence with Environment Variables")
+        print("Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

@@ -1006,7 +1003,7 @@ def test_new_environment_variables_added():
        data_dir.mkdir()

        print(f"\n{'='*80}")
-        print(f"Test: New Environment Variables Added to Config")
+        print("Test: New Environment Variables Added to Config")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")