cleanup archivebox tests

2026-04-05 23:37:58 +10:00 · 2026-03-15 22:09:56 -07:00
parent 9de084da65
commit 57e11879ec
23 changed files with 487 additions and 1495 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,7 @@ lib/
 tmp/
 data/
 data*/
+archivebox/tests/data/
 archive/
 output/
 logs/
--- a/archivebox/cli/init.py
+++ b/archivebox/cli/init.py
@@ -107,7 +107,10 @@ class ArchiveBoxGroup(click.Group):
        # handle renamed commands
        if cmd_name in self.renamed_commands:
            new_name = self.renamed_commands[cmd_name]
-            print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
+            print(
+                f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
+                file=sys.stderr,
+            )
            cmd_name = new_name
            ctx.invoked_subcommand = cmd_name

--- a/archivebox/cli/archivebox_binary.py
+++ b/archivebox/cli/archivebox_binary.py
@@ -63,11 +63,28 @@ def create_binary(
        return 1

    try:
-        binary, created = Binary.objects.get_or_create(
+        from archivebox.machine.models import Machine
+
+        machine = Machine.current()
+        created = not Binary.objects.filter(
+            machine=machine,
            name=name,
            abspath=abspath,
-            defaults={'version': version}
-        )
+            version=version,
+        ).exists()
+
+        # Mirror the Binary model lifecycle used elsewhere in the system so CLI
+        # records are owned by the current machine and can be safely piped into
+        # `archivebox run` without creating invalid rows missing machine_id.
+        binary = Binary.from_json({
+            'name': name,
+            'abspath': abspath,
+            'version': version,
+            'binproviders': 'env',
+            'binprovider': 'env',
+        })
+        if binary is None:
+            raise ValueError('failed to create binary record')

        if not is_tty:
            write_record(binary.to_json())
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -81,6 +81,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:

 def run_plugins(
    args: tuple,
+    records: list[dict] | None = None,
    plugins: str = '',
    wait: bool = True,
 ) -> int:
@@ -108,8 +109,12 @@ def run_plugins(
    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []

-    # Collect all input records
-    records = list(read_args_or_stdin(args))
+    # Parse stdin/args exactly once per CLI invocation.
+    # `main()` may already have consumed stdin to distinguish Snapshot input from
+    # ArchiveResult IDs; if so, it must pass the parsed records through here
+    # instead of asking this helper to reread an already-drained pipe.
+    if records is None:
+        records = list(read_args_or_stdin(args))

    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
@@ -269,7 +274,7 @@ def main(plugins: str, wait: bool, args: tuple):
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input
-        sys.exit(run_plugins(args, plugins=plugins, wait=wait))
+        sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))


 if __name__ == '__main__':
--- a/archivebox/cli/tests.py
+++ b/archivebox/cli/tests.py
@@ -1,231 +0,0 @@
-#!/usr/bin/env python3
-
-__package__ = 'archivebox.cli'
-
-
-import importlib
-import os
-import shutil
-import sys
-import unittest
-from contextlib import contextmanager
-from pathlib import Path
-
-from archivebox.config.constants import CONSTANTS
-
-TEST_CONFIG = {
-    'USE_COLOR': 'False',
-    'SHOW_PROGRESS': 'False',
-
-    'DATA_DIR': 'data.tests',
-    
-    'SAVE_ARCHIVEDOTORG': 'False',
-    'SAVE_TITLE': 'False',
-    
-    'USE_CURL': 'False',
-    'USE_WGET': 'False',
-    'USE_GIT': 'False',
-    'USE_CHROME': 'False',
-    'USE_YOUTUBEDL': 'False',
-}
-
-DATA_DIR = 'data.tests'
-os.environ.update(TEST_CONFIG)
-
-init = importlib.import_module('archivebox.main').init
-SQL_INDEX_FILENAME = CONSTANTS.SQL_INDEX_FILENAME
-JSON_INDEX_FILENAME = CONSTANTS.JSON_INDEX_FILENAME
-HTML_INDEX_FILENAME = CONSTANTS.HTML_INDEX_FILENAME
-archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
-archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
-archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
-parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
-
-HIDE_CLI_OUTPUT = True
-
-test_urls = '''
-https://example1.com/what/is/happening.html?what=1#how-about-this=1
-https://example2.com/what/is/happening/?what=1#how-about-this=1
-HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
-https://example4.com/what/is/happening.html
-https://example5.com/
-https://example6.com
-
-<test>http://example7.com</test>
-[https://example8.com/what/is/this.php?what=1]
-[and http://example9.com?what=1&other=3#and-thing=2]
-<what>https://example10.com#and-thing=2 "</about>
-abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
-sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
-example13.bada
-and example14.badb
-<or>htt://example15.badc</that>
-'''
-
-stdout = sys.stdout
-stderr = sys.stderr
-
-
-def load_main_index(*, out_dir: str):
-    index_path = Path(out_dir) / JSON_INDEX_FILENAME
-    if not index_path.exists():
-        raise FileNotFoundError(index_path)
-    return list(parse_json_main_index(Path(out_dir)))
-
-
-@contextmanager
-def output_hidden(show_failing=True):
-    if not HIDE_CLI_OUTPUT:
-        yield
-        return
-
-    sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
-    sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
-    try:
-        yield
-        sys.stdout.close()
-        sys.stderr.close()
-        sys.stdout = stdout
-        sys.stderr = stderr
-    except Exception:
-        sys.stdout.close()
-        sys.stderr.close()
-        sys.stdout = stdout
-        sys.stderr = stderr
-        if show_failing:
-            with open('stdout.txt', 'r', encoding='utf-8') as f:
-                print(f.read())
-            with open('stderr.txt', 'r', encoding='utf-8') as f:
-                print(f.read())
-        raise
-    finally:
-        os.remove('stdout.txt')
-        os.remove('stderr.txt')
-
-
-class TestInit(unittest.TestCase):
-    def setUp(self):
-        os.makedirs(DATA_DIR, exist_ok=True)
-
-    def tearDown(self):
-        shutil.rmtree(DATA_DIR, ignore_errors=True)
-
-    def test_basic_init(self):
-        with output_hidden():
-            archivebox_init.main([])
-
-        assert (Path(DATA_DIR) / SQL_INDEX_FILENAME).exists()
-        assert (Path(DATA_DIR) / JSON_INDEX_FILENAME).exists()
-        assert (Path(DATA_DIR) / HTML_INDEX_FILENAME).exists()
-        assert len(load_main_index(out_dir=DATA_DIR)) == 0
-
-    def test_conflicting_init(self):
-        with open(Path(DATA_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
-            f.write('test')
-
-        try:
-            with output_hidden(show_failing=False):
-                archivebox_init.main([])
-            assert False, 'Init should have exited with an exception'
-        except SystemExit:
-            pass
-
-        assert not (Path(DATA_DIR) / SQL_INDEX_FILENAME).exists()
-        assert not (Path(DATA_DIR) / JSON_INDEX_FILENAME).exists()
-        assert not (Path(DATA_DIR) / HTML_INDEX_FILENAME).exists()
-        try:
-            load_main_index(out_dir=DATA_DIR)
-            assert False, 'load_main_index should raise an exception when no index is present'
-        except Exception:
-            pass
-
-    def test_no_dirty_state(self):
-        with output_hidden():
-            init()
-        shutil.rmtree(DATA_DIR, ignore_errors=True)
-        with output_hidden():
-            init()
-
-
-class TestAdd(unittest.TestCase):
-    def setUp(self):
-        os.makedirs(DATA_DIR, exist_ok=True)
-        with output_hidden():
-            init()
-
-    def tearDown(self):
-        shutil.rmtree(DATA_DIR, ignore_errors=True)
-
-    def test_add_arg_url(self):
-        with output_hidden():
-            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 30
-
-    def test_add_arg_file(self):
-        test_file = Path(DATA_DIR) / 'test.txt'
-        with open(test_file, 'w+', encoding='utf') as f:
-            f.write(test_urls)
-
-        with output_hidden():
-            archivebox_add.main([test_file])
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 12
-        os.remove(test_file)
-
-    def test_add_stdin_url(self):
-        with output_hidden():
-            archivebox_add.main([], stdin=test_urls)
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 12
-
-
-class TestRemove(unittest.TestCase):
-    def setUp(self):
-        os.makedirs(DATA_DIR, exist_ok=True)
-        with output_hidden():
-            init()
-            archivebox_add.main([], stdin=test_urls)
-
-    # def tearDown(self):
-        # shutil.rmtree(DATA_DIR, ignore_errors=True)
-
-
-    def test_remove_exact(self):
-        with output_hidden():
-            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 11
-
-    def test_remove_regex(self):
-        with output_hidden():
-            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 4
-
-    def test_remove_domain(self):
-        with output_hidden():
-            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
-
-        all_links = load_main_index(out_dir=DATA_DIR)
-        assert len(all_links) == 10
-
-    def test_remove_none(self):
-        try:
-            with output_hidden(show_failing=False):
-                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
-            assert False, 'Should raise if no URLs match'
-        except Exception:
-            pass
-
-
-if __name__ == '__main__':
-    if '--verbose' in sys.argv or '-v' in sys.argv:
-        HIDE_CLI_OUTPUT = False
-    
-    unittest.main()
--- a/archivebox/cli/tests_piping.py
+++ b/archivebox/cli/tests_piping.py
@@ -1,665 +0,0 @@
-#!/usr/bin/env python3
-"""
-Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
-
-This module tests the JSONL-based piping between CLI commands as described in:
-https://github.com/ArchiveBox/ArchiveBox/issues/1363
-
-Workflows tested:
-    archivebox crawl create URL        -> Crawl JSONL
-    archivebox snapshot create         -> Snapshot JSONL (accepts Crawl or URL input)
-    archivebox archiveresult create    -> ArchiveResult JSONL (accepts Snapshot input)
-    archivebox run                     -> Process queued records (accepts any JSONL)
-
-Pipeline:
-    archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
-
-Each command should:
-    - Accept URLs, IDs, or JSONL as input (args or stdin)
-    - Output JSONL to stdout when piped (not TTY)
-    - Output human-readable to stderr when TTY
-"""
-
-__package__ = 'archivebox.cli'
-
-import os
-import json
-import shutil
-import tempfile
-import unittest
-from io import StringIO
-from pathlib import Path
-from typing import TypeVar
-
-# Test configuration - disable slow extractors
-TEST_CONFIG = {
-    'USE_COLOR': 'False',
-    'SHOW_PROGRESS': 'False',
-    'SAVE_ARCHIVEDOTORG': 'False',
-    'SAVE_TITLE': 'True',  # Fast extractor
-    'SAVE_FAVICON': 'False',
-    'SAVE_WGET': 'False',
-    'SAVE_WARC': 'False',
-    'SAVE_PDF': 'False',
-    'SAVE_SCREENSHOT': 'False',
-    'SAVE_DOM': 'False',
-    'SAVE_SINGLEFILE': 'False',
-    'SAVE_READABILITY': 'False',
-    'SAVE_MERCURY': 'False',
-    'SAVE_GIT': 'False',
-    'SAVE_YTDLP': 'False',
-    'SAVE_HEADERS': 'False',
-    'USE_CURL': 'False',
-    'USE_WGET': 'False',
-    'USE_GIT': 'False',
-    'USE_CHROME': 'False',
-    'USE_YOUTUBEDL': 'False',
-    'USE_NODE': 'False',
-}
-
-os.environ.update(TEST_CONFIG)
-
-T = TypeVar('T')
-
-
-def require(value: T | None) -> T:
-    if value is None:
-        raise AssertionError('Expected value to be present')
-    return value
-
-
-class MockTTYStringIO(StringIO):
-    def __init__(self, initial_value: str = '', *, is_tty: bool):
-        super().__init__(initial_value)
-        self._is_tty = is_tty
-
-    def isatty(self) -> bool:
-        return self._is_tty
-
-
-# =============================================================================
-# JSONL Utility Tests
-# =============================================================================
-
-class TestJSONLParsing(unittest.TestCase):
-    """Test JSONL input parsing utilities."""
-
-    def test_parse_plain_url(self):
-        """Plain URLs should be parsed as Snapshot records."""
-        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
-
-        result = require(parse_line('https://example.com'))
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['url'], 'https://example.com')
-
-    def test_parse_jsonl_snapshot(self):
-        """JSONL Snapshot records should preserve all fields."""
-        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
-
-        line = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
-        result = require(parse_line(line))
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['url'], 'https://example.com')
-        self.assertEqual(result['tags'], 'test,demo')
-
-    def test_parse_jsonl_crawl(self):
-        """JSONL Crawl records should be parsed correctly."""
-        from archivebox.misc.jsonl import parse_line, TYPE_CRAWL
-
-        line = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'
-        result = require(parse_line(line))
-        self.assertEqual(result['type'], TYPE_CRAWL)
-        self.assertEqual(result['id'], 'abc123')
-        self.assertEqual(result['urls'], 'https://example.com')
-        self.assertEqual(result['max_depth'], 1)
-
-    def test_parse_jsonl_with_id(self):
-        """JSONL with id field should be recognized."""
-        from archivebox.misc.jsonl import parse_line
-
-        line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
-        result = require(parse_line(line))
-        self.assertEqual(result['id'], 'abc123')
-        self.assertEqual(result['url'], 'https://example.com')
-
-    def test_parse_uuid_as_snapshot_id(self):
-        """Bare UUIDs should be parsed as snapshot IDs."""
-        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
-
-        uuid = '01234567-89ab-cdef-0123-456789abcdef'
-        result = require(parse_line(uuid))
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['id'], uuid)
-
-    def test_parse_empty_line(self):
-        """Empty lines should return None."""
-        from archivebox.misc.jsonl import parse_line
-
-        self.assertIsNone(parse_line(''))
-        self.assertIsNone(parse_line('   '))
-        self.assertIsNone(parse_line('\n'))
-
-    def test_parse_comment_line(self):
-        """Comment lines should return None."""
-        from archivebox.misc.jsonl import parse_line
-
-        self.assertIsNone(parse_line('# This is a comment'))
-        self.assertIsNone(parse_line('  # Indented comment'))
-
-    def test_parse_invalid_url(self):
-        """Invalid URLs should return None."""
-        from archivebox.misc.jsonl import parse_line
-
-        self.assertIsNone(parse_line('not-a-url'))
-        self.assertIsNone(parse_line('ftp://example.com'))  # Only http/https/file
-
-    def test_parse_file_url(self):
-        """file:// URLs should be parsed."""
-        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
-
-        result = require(parse_line('file:///path/to/file.txt'))
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['url'], 'file:///path/to/file.txt')
-
-
-# Note: JSONL output serialization is tested in TestPipingWorkflowIntegration
-# using real model instances, not mocks.
-
-
-class TestReadArgsOrStdin(unittest.TestCase):
-    """Test reading from args or stdin."""
-
-    def test_read_from_args(self):
-        """Should read URLs from command line args."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        args = ('https://example1.com', 'https://example2.com')
-        records = list(read_args_or_stdin(args))
-
-        self.assertEqual(len(records), 2)
-        self.assertEqual(records[0]['url'], 'https://example1.com')
-        self.assertEqual(records[1]['url'], 'https://example2.com')
-
-    def test_read_from_stdin(self):
-        """Should read URLs from stdin when no args provided."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stdin_content = 'https://example1.com\nhttps://example2.com\n'
-        stream = MockTTYStringIO(stdin_content, is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stream))
-
-        self.assertEqual(len(records), 2)
-        self.assertEqual(records[0]['url'], 'https://example1.com')
-        self.assertEqual(records[1]['url'], 'https://example2.com')
-
-    def test_read_jsonl_from_stdin(self):
-        """Should read JSONL from stdin."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stdin_content = '{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n'
-        stream = MockTTYStringIO(stdin_content, is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stream))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['url'], 'https://example.com')
-        self.assertEqual(records[0]['tags'], 'test')
-
-    def test_read_crawl_jsonl_from_stdin(self):
-        """Should read Crawl JSONL from stdin."""
-        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
-
-        stdin_content = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n'
-        stream = MockTTYStringIO(stdin_content, is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stream))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['type'], TYPE_CRAWL)
-        self.assertEqual(records[0]['id'], 'abc123')
-
-    def test_skip_tty_stdin(self):
-        """Should not read from TTY stdin (would block)."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stream = MockTTYStringIO('https://example.com', is_tty=True)
-
-        records = list(read_args_or_stdin((), stream=stream))
-        self.assertEqual(len(records), 0)
-
-
-# =============================================================================
-# Unit Tests for Individual Commands
-# =============================================================================
-
-class TestCrawlCommand(unittest.TestCase):
-    """Unit tests for archivebox crawl command."""
-
-    def setUp(self):
-        """Set up test environment."""
-        self.test_dir = tempfile.mkdtemp()
-        os.environ['DATA_DIR'] = self.test_dir
-
-    def tearDown(self):
-        """Clean up test environment."""
-        shutil.rmtree(self.test_dir, ignore_errors=True)
-
-    def test_crawl_accepts_url(self):
-        """crawl should accept URLs as input."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        args = ('https://example.com',)
-        records = list(read_args_or_stdin(args))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['url'], 'https://example.com')
-
-    def test_crawl_output_format(self):
-        """crawl should output Crawl JSONL records."""
-        from archivebox.misc.jsonl import TYPE_CRAWL
-
-        # Mock crawl output
-        crawl_output = {
-            'type': TYPE_CRAWL,
-            'schema_version': '0.9.0',
-            'id': 'test-crawl-id',
-            'urls': 'https://example.com',
-            'status': 'queued',
-            'max_depth': 0,
-        }
-
-        self.assertEqual(crawl_output['type'], TYPE_CRAWL)
-        self.assertIn('id', crawl_output)
-        self.assertIn('urls', crawl_output)
-
-
-class TestSnapshotCommand(unittest.TestCase):
-    """Unit tests for archivebox snapshot command."""
-
-    def setUp(self):
-        """Set up test environment."""
-        self.test_dir = tempfile.mkdtemp()
-        os.environ['DATA_DIR'] = self.test_dir
-
-    def tearDown(self):
-        """Clean up test environment."""
-        shutil.rmtree(self.test_dir, ignore_errors=True)
-
-    def test_snapshot_accepts_url(self):
-        """snapshot should accept URLs as input."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        args = ('https://example.com',)
-        records = list(read_args_or_stdin(args))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['url'], 'https://example.com')
-
-    def test_snapshot_accepts_crawl_jsonl(self):
-        """snapshot should accept Crawl JSONL as input."""
-        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
-
-        stdin = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n', is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['type'], TYPE_CRAWL)
-        self.assertEqual(records[0]['id'], 'abc123')
-        self.assertEqual(records[0]['urls'], 'https://example.com')
-
-    def test_snapshot_accepts_jsonl_with_metadata(self):
-        """snapshot should accept JSONL with tags and other metadata."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stdin = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n', is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['url'], 'https://example.com')
-        self.assertEqual(records[0]['tags'], 'tag1,tag2')
-        self.assertEqual(records[0]['title'], 'Test')
-
-    # Note: Snapshot output format is tested in integration tests
-    # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
-    # using real Snapshot instances.
-
-
-class TestArchiveResultCommand(unittest.TestCase):
-    """Unit tests for archivebox archiveresult command."""
-
-    def setUp(self):
-        """Set up test environment."""
-        self.test_dir = tempfile.mkdtemp()
-        os.environ['DATA_DIR'] = self.test_dir
-
-    def tearDown(self):
-        """Clean up test environment."""
-        shutil.rmtree(self.test_dir, ignore_errors=True)
-
-    def test_archiveresult_accepts_snapshot_id(self):
-        """archiveresult should accept snapshot IDs as input."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        uuid = '01234567-89ab-cdef-0123-456789abcdef'
-        args = (uuid,)
-        records = list(read_args_or_stdin(args))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['id'], uuid)
-
-    def test_archiveresult_accepts_jsonl_snapshot(self):
-        """archiveresult should accept JSONL Snapshot records."""
-        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
-
-        stdin = MockTTYStringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n', is_tty=False)
-
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 1)
-        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
-        self.assertEqual(records[0]['id'], 'abc123')
-
-    def test_archiveresult_gathers_snapshot_ids(self):
-        """archiveresult should gather snapshot IDs from various input formats."""
-        from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
-
-        records = [
-            {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
-            {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
-            {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
-            {'id': 'snap-4'},  # Bare id
-        ]
-
-        snapshot_ids = set()
-        for record in records:
-            record_type = record.get('type')
-
-            if record_type == TYPE_SNAPSHOT:
-                snapshot_id = record.get('id')
-                if snapshot_id:
-                    snapshot_ids.add(snapshot_id)
-            elif record_type == TYPE_ARCHIVERESULT:
-                snapshot_id = record.get('snapshot_id')
-                if snapshot_id:
-                    snapshot_ids.add(snapshot_id)
-            elif 'id' in record:
-                snapshot_ids.add(record['id'])
-
-        self.assertEqual(len(snapshot_ids), 4)
-        self.assertIn('snap-1', snapshot_ids)
-        self.assertIn('snap-2', snapshot_ids)
-        self.assertIn('snap-3', snapshot_ids)
-        self.assertIn('snap-4', snapshot_ids)
-
-
-# =============================================================================
-# URL Collection Tests
-# =============================================================================
-
-class TestURLCollection(unittest.TestCase):
-    """Test collecting urls.jsonl from extractor output."""
-
-    def setUp(self):
-        """Create test directory structure."""
-        self.test_dir = Path(tempfile.mkdtemp())
-
-        # Create fake extractor output directories with urls.jsonl
-        (self.test_dir / 'wget').mkdir()
-        (self.test_dir / 'wget' / 'urls.jsonl').write_text(
-            '{"url": "https://wget-link-1.com"}\n'
-            '{"url": "https://wget-link-2.com"}\n'
-        )
-
-        (self.test_dir / 'parse_html_urls').mkdir()
-        (self.test_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
-            '{"url": "https://html-link-1.com"}\n'
-            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
-        )
-
-        (self.test_dir / 'screenshot').mkdir()
-        # No urls.jsonl in screenshot dir - not a parser
-
-    def tearDown(self):
-        """Clean up test directory."""
-        shutil.rmtree(self.test_dir, ignore_errors=True)
-
-    def test_collect_urls_from_plugins(self):
-        """Should collect urls.jsonl from all parser plugin subdirectories."""
-        from archivebox.hooks import collect_urls_from_plugins
-
-        urls = collect_urls_from_plugins(self.test_dir)
-
-        self.assertEqual(len(urls), 4)
-
-        # Check that plugin is set
-        plugins = {u['plugin'] for u in urls}
-        self.assertIn('wget', plugins)
-        self.assertIn('parse_html_urls', plugins)
-        self.assertNotIn('screenshot', plugins)  # No urls.jsonl
-
-    def test_collect_urls_preserves_metadata(self):
-        """Should preserve metadata from urls.jsonl entries."""
-        from archivebox.hooks import collect_urls_from_plugins
-
-        urls = collect_urls_from_plugins(self.test_dir)
-
-        # Find the entry with title
-        titled = [u for u in urls if u.get('title') == 'HTML Link 2']
-        self.assertEqual(len(titled), 1)
-        self.assertEqual(titled[0]['url'], 'https://html-link-2.com')
-
-    def test_collect_urls_empty_dir(self):
-        """Should handle empty or non-existent directories."""
-        from archivebox.hooks import collect_urls_from_plugins
-
-        empty_dir = self.test_dir / 'nonexistent'
-        urls = collect_urls_from_plugins(empty_dir)
-
-        self.assertEqual(len(urls), 0)
-
-
-class TestEdgeCases(unittest.TestCase):
-    """Test edge cases and error handling."""
-
-    def test_empty_input(self):
-        """Commands should handle empty input gracefully."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        # Empty args, TTY stdin (should not block)
-        stdin = MockTTYStringIO('', is_tty=True)
-
-        records = list(read_args_or_stdin((), stream=stdin))
-        self.assertEqual(len(records), 0)
-
-    def test_malformed_jsonl(self):
-        """Should skip malformed JSONL lines."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stdin = MockTTYStringIO(
-            '{"url": "https://good.com"}\n'
-            'not valid json\n'
-            '{"url": "https://also-good.com"}\n',
-            is_tty=False,
-        )
-
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 2)
-        urls = {r['url'] for r in records}
-        self.assertEqual(urls, {'https://good.com', 'https://also-good.com'})
-
-    def test_mixed_input_formats(self):
-        """Should handle mixed URLs and JSONL."""
-        from archivebox.misc.jsonl import read_args_or_stdin
-
-        stdin = MockTTYStringIO(
-            'https://plain-url.com\n'
-            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
-            '01234567-89ab-cdef-0123-456789abcdef\n',  # UUID
-            is_tty=False,
-        )
-
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 3)
-
-        # Plain URL
-        self.assertEqual(records[0]['url'], 'https://plain-url.com')
-
-        # JSONL with metadata
-        self.assertEqual(records[1]['url'], 'https://jsonl-url.com')
-        self.assertEqual(records[1]['tags'], 'test')
-
-        # UUID
-        self.assertEqual(records[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
-
-    def test_crawl_with_multiple_urls(self):
-        """Crawl should handle multiple URLs in a single crawl."""
-        from archivebox.misc.jsonl import TYPE_CRAWL
-
-        # Test crawl JSONL with multiple URLs
-        crawl_output = {
-            'type': TYPE_CRAWL,
-            'id': 'test-multi-url-crawl',
-            'urls': 'https://url1.com\nhttps://url2.com\nhttps://url3.com',
-            'max_depth': 0,
-        }
-
-        # Parse the URLs
-        urls = [u.strip() for u in crawl_output['urls'].split('\n') if u.strip()]
-
-        self.assertEqual(len(urls), 3)
-        self.assertEqual(urls[0], 'https://url1.com')
-        self.assertEqual(urls[1], 'https://url2.com')
-        self.assertEqual(urls[2], 'https://url3.com')
-
-
-# =============================================================================
-# Pass-Through Behavior Tests
-# =============================================================================
-
-class TestPassThroughBehavior(unittest.TestCase):
-    """Test pass-through behavior in CLI commands."""
-
-    def test_crawl_passes_through_other_types(self):
-        """crawl create should pass through records with other types."""
-
-        # Input: a Tag record (not a Crawl or URL)
-        tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
-        url_record = {'url': 'https://example.com'}
-
-        # Mock stdin with both records
-        stdin = MockTTYStringIO(
-            json.dumps(tag_record)
-            + '\n'
-            + json.dumps(url_record),
-            is_tty=False,
-        )
-
-        # The Tag should be passed through, the URL should create a Crawl
-        # (This is a unit test of the pass-through logic)
-        from archivebox.misc.jsonl import read_args_or_stdin
-        records = list(read_args_or_stdin((), stream=stdin))
-
-        self.assertEqual(len(records), 2)
-        # First record is a Tag (other type)
-        self.assertEqual(records[0]['type'], 'Tag')
-        # Second record has a URL
-        self.assertIn('url', records[1])
-
-    def test_snapshot_passes_through_crawl(self):
-        """snapshot create should pass through Crawl records."""
-        from archivebox.misc.jsonl import TYPE_CRAWL
-
-        crawl_record = {
-            'type': TYPE_CRAWL,
-            'id': 'test-crawl',
-            'urls': 'https://example.com',
-        }
-
-        # Crawl records should be passed through AND create snapshots
-        # This tests the accumulation behavior
-        self.assertEqual(crawl_record['type'], TYPE_CRAWL)
-        self.assertIn('urls', crawl_record)
-
-    def test_archiveresult_passes_through_snapshot(self):
-        """archiveresult create should pass through Snapshot records."""
-        from archivebox.misc.jsonl import TYPE_SNAPSHOT
-
-        snapshot_record = {
-            'type': TYPE_SNAPSHOT,
-            'id': 'test-snapshot',
-            'url': 'https://example.com',
-        }
-
-        # Snapshot records should be passed through
-        self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
-        self.assertIn('url', snapshot_record)
-
-    def test_run_passes_through_unknown_types(self):
-        """run should pass through records with unknown types."""
-        unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
-
-        # Unknown types should be passed through unchanged
-        self.assertEqual(unknown_record['type'], 'Unknown')
-        self.assertIn('data', unknown_record)
-
-
-class TestPipelineAccumulation(unittest.TestCase):
-    """Test that pipelines accumulate records correctly."""
-
-    def test_full_pipeline_output_types(self):
-        """Full pipeline should output all record types."""
-        from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
-
-        # Simulated pipeline output after: crawl | snapshot | archiveresult | run
-        # Should contain Crawl, Snapshot, and ArchiveResult records
-        pipeline_output = [
-            {'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
-            {'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
-            {'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
-        ]
-
-        types = {r['type'] for r in pipeline_output}
-        self.assertIn(TYPE_CRAWL, types)
-        self.assertIn(TYPE_SNAPSHOT, types)
-        self.assertIn(TYPE_ARCHIVERESULT, types)
-
-    def test_pipeline_preserves_ids(self):
-        """Pipeline should preserve record IDs through all stages."""
-        records = [
-            {'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
-            {'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
-        ]
-
-        # All records should have IDs
-        for record in records:
-            self.assertIn('id', record)
-            self.assertTrue(record['id'])
-
-    def test_jq_transform_pattern(self):
-        """Test pattern for jq transforms in pipeline."""
-        # Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
-        failed_record = {
-            'type': 'ArchiveResult',
-            'id': 'ar1',
-            'status': 'failed',
-            'plugin': 'wget',
-        }
-
-        # Transform: delete id, set status to queued
-        transformed = {
-            'type': failed_record['type'],
-            'status': 'queued',
-            'plugin': failed_record['plugin'],
-        }
-
-        self.assertNotIn('id', transformed)
-        self.assertEqual(transformed['status'], 'queued')
-
-
-if __name__ == '__main__':
-    unittest.main()
--- a/archivebox/core/tests.py
+++ b/archivebox/core/tests.py
@@ -1,382 +0,0 @@
-"""Tests for the core views, especially AddView."""
-
-import importlib
-import os
-import django
-from unittest.mock import patch
-from typing import TypeVar, cast
-
-from django.forms import BaseForm
-
-# Set up Django before importing any Django-dependent modules
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
-django.setup()
-
-TestCase = importlib.import_module('django.test').TestCase
-Client = importlib.import_module('django.test').Client
-User = importlib.import_module('django.contrib.auth.models').User
-reverse = importlib.import_module('django.urls').reverse
-Crawl = importlib.import_module('archivebox.crawls.models').Crawl
-CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
-Tag = importlib.import_module('archivebox.core.models').Tag
-SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
-
-T = TypeVar('T')
-
-
-def require(value: T | None) -> T:
-    if value is None:
-        raise AssertionError('Expected value to be present')
-    return value
-
-
-class AddViewTests(TestCase):
-    """Tests for the AddView (crawl creation form)."""
-
-    def setUp(self):
-        """Set up test user and client."""
-        self.client = Client()
-        self.user = User.objects.create_user(
-            username='testuser',
-            password='testpass123',
-            email='test@example.com'
-        )
-        self.client.login(username='testuser', password='testpass123')
-        self.add_url = reverse('add')
-
-    def test_add_view_get_requires_auth(self):
-        """Test that GET /add requires authentication."""
-        self.client.logout()
-        response = self.client.get(self.add_url)
-        # Should redirect to login or show 403/404
-        self.assertIn(response.status_code, [302, 403, 404])
-
-    def test_add_view_get_shows_form(self):
-        """Test that GET /add shows the form with all fields."""
-        response = self.client.get(self.add_url)
-        self.assertEqual(response.status_code, 200)
-
-        # Check that form fields are present
-        self.assertContains(response, 'name="url"')
-        self.assertContains(response, 'name="tag"')
-        self.assertContains(response, 'name="depth"')
-        self.assertContains(response, 'name="notes"')
-        self.assertContains(response, 'name="schedule"')
-        self.assertContains(response, 'name="persona"')
-        self.assertContains(response, 'name="overwrite"')
-        self.assertContains(response, 'name="update"')
-        self.assertContains(response, 'name="index_only"')
-
-        # Check for plugin groups
-        self.assertContains(response, 'name="chrome_plugins"')
-        self.assertContains(response, 'name="archiving_plugins"')
-        self.assertContains(response, 'name="parsing_plugins"')
-
-    def test_add_view_shows_tag_autocomplete(self):
-        """Test that tag autocomplete datalist is rendered."""
-        # Create some tags
-        Tag.objects.create(name='test-tag-1')
-        Tag.objects.create(name='test-tag-2')
-
-        response = self.client.get(self.add_url)
-        self.assertEqual(response.status_code, 200)
-
-        # Check for datalist with tags
-        self.assertContains(response, 'id="tag-datalist"')
-        self.assertContains(response, 'test-tag-1')
-        self.assertContains(response, 'test-tag-2')
-
-    def test_add_view_shows_plugin_presets(self):
-        """Test that plugin preset buttons are rendered."""
-        response = self.client.get(self.add_url)
-        self.assertEqual(response.status_code, 200)
-
-        self.assertContains(response, 'Quick Archive')
-        self.assertContains(response, 'Full Chrome')
-        self.assertContains(response, 'Text Only')
-        self.assertContains(response, 'Select All')
-        self.assertContains(response, 'Clear All')
-
-    def test_add_view_shows_links_to_resources(self):
-        """Test that helpful links are present."""
-        response = self.client.get(self.add_url)
-        self.assertEqual(response.status_code, 200)
-
-        # Link to plugin documentation
-        self.assertContains(response, '/admin/environment/plugins/')
-
-        # Link to create new persona
-        self.assertContains(response, '/admin/personas/persona/add/')
-
-    def test_add_basic_crawl_without_schedule(self):
-        """Test creating a basic crawl without a schedule."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com\nhttps://example.org',
-            'tag': 'test-tag',
-            'depth': '0',
-            'notes': 'Test crawl notes',
-        })
-
-        # Should redirect to crawl admin page
-        self.assertEqual(response.status_code, 302)
-
-        # Check that crawl was created
-        self.assertEqual(Crawl.objects.count(), 1)
-        crawl = require(Crawl.objects.first())
-
-        self.assertIn('https://example.com', crawl.urls)
-        self.assertIn('https://example.org', crawl.urls)
-        self.assertEqual(crawl.tags_str, 'test-tag')
-        self.assertEqual(crawl.max_depth, 0)
-        self.assertEqual(crawl.notes, 'Test crawl notes')
-        self.assertEqual(crawl.created_by, self.user)
-
-        # No schedule should be created
-        self.assertIsNone(crawl.schedule)
-        self.assertEqual(CrawlSchedule.objects.count(), 0)
-
-    def test_add_crawl_with_schedule(self):
-        """Test creating a crawl with a repeat schedule."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'tag': 'scheduled',
-            'depth': '1',
-            'notes': 'Daily crawl',
-            'schedule': 'daily',
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        # Check that crawl and schedule were created
-        self.assertEqual(Crawl.objects.count(), 1)
-        self.assertEqual(CrawlSchedule.objects.count(), 1)
-
-        crawl = require(Crawl.objects.first())
-        schedule = require(CrawlSchedule.objects.first())
-
-        self.assertEqual(crawl.schedule, schedule)
-        self.assertEqual(schedule.template, crawl)
-        self.assertEqual(schedule.schedule, 'daily')
-        self.assertTrue(schedule.is_enabled)
-        self.assertEqual(schedule.created_by, self.user)
-
-    def test_add_crawl_with_cron_schedule(self):
-        """Test creating a crawl with a cron format schedule."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'schedule': '0 */6 * * *',  # Every 6 hours
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        schedule = require(CrawlSchedule.objects.first())
-        self.assertEqual(schedule.schedule, '0 */6 * * *')
-
-    def test_add_crawl_with_plugins(self):
-        """Test creating a crawl with specific plugins selected."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'chrome_plugins': ['screenshot', 'dom'],
-            'archiving_plugins': ['wget'],
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        crawl = require(Crawl.objects.first())
-        plugins = crawl.config.get('PLUGINS', '')
-
-        # Should contain the selected plugins
-        self.assertIn('screenshot', plugins)
-        self.assertIn('dom', plugins)
-        self.assertIn('wget', plugins)
-
-    def test_add_crawl_with_depth_range(self):
-        """Test creating crawls with different depth values (0-4)."""
-        for depth in range(5):
-            response = self.client.post(self.add_url, {
-                'url': f'https://example{depth}.com',
-                'depth': str(depth),
-            })
-
-            self.assertEqual(response.status_code, 302)
-
-        self.assertEqual(Crawl.objects.count(), 5)
-
-        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
-            self.assertEqual(crawl.max_depth, i)
-
-    def test_add_crawl_with_advanced_options(self):
-        """Test creating a crawl with advanced options."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'persona': 'CustomPersona',
-            'overwrite': True,
-            'update': True,
-            'index_only': True,
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        crawl = require(Crawl.objects.first())
-        config = crawl.config
-
-        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
-        self.assertEqual(config.get('OVERWRITE'), True)
-        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
-        self.assertEqual(config.get('INDEX_ONLY'), True)
-
-    def test_add_crawl_with_custom_config(self):
-        """Test creating a crawl with custom config overrides."""
-        # Note: Django test client can't easily POST the KeyValueWidget format,
-        # so this test would need to use the form directly or mock the cleaned_data
-        # For now, we'll skip this test or mark it as TODO
-        pass
-
-    def test_add_public_anonymous_custom_config_is_silently_stripped(self):
-        """Anonymous users cannot override crawl config, even with PUBLIC_ADD_VIEW enabled."""
-        self.client.logout()
-
-        with patch.object(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True):
-            response = self.client.post(self.add_url, {
-                'url': 'https://example.com',
-                'depth': '0',
-                'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
-            })
-
-        self.assertEqual(response.status_code, 302)
-        crawl = require(Crawl.objects.order_by('-created_at').first())
-        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
-
-    def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
-        """Authenticated non-admin users cannot override crawl config."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
-        })
-
-        self.assertEqual(response.status_code, 302)
-        crawl = require(Crawl.objects.order_by('-created_at').first())
-        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
-
-    def test_add_staff_admin_custom_config_is_allowed(self):
-        """Admin users can override crawl config."""
-        self.client.logout()
-        User.objects.create_user(
-            username='adminuser',
-            password='adminpass123',
-            email='admin@example.com',
-            is_staff=True,
-        )
-        self.client.login(username='adminuser', password='adminpass123')
-
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'config': '{"YTDLP_ARGS_EXTRA":["--exec","echo hello"]}',
-        })
-
-        self.assertEqual(response.status_code, 302)
-        crawl = require(Crawl.objects.order_by('-created_at').first())
-        self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])
-
-    def test_add_empty_urls_fails(self):
-        """Test that submitting without URLs fails validation."""
-        response = self.client.post(self.add_url, {
-            'url': '',
-            'depth': '0',
-        })
-
-        # Should show form again with errors, not redirect
-        self.assertEqual(response.status_code, 200)
-        self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')
-
-    def test_add_invalid_urls_fails(self):
-        """Test that invalid URLs fail validation."""
-        response = self.client.post(self.add_url, {
-            'url': 'not-a-url',
-            'depth': '0',
-        })
-
-        # Should show form again with errors
-        self.assertEqual(response.status_code, 200)
-        # Check for validation error (URL regex should fail)
-        self.assertContains(response, 'error')
-
-    def test_add_success_message_without_schedule(self):
-        """Test that success message is shown without schedule link."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com\nhttps://example.org',
-            'depth': '0',
-        }, follow=True)
-
-        # Check success message mentions crawl creation
-        messages = list(response.context['messages'])
-        self.assertEqual(len(messages), 1)
-        message_text = str(messages[0])
-
-        self.assertIn('Created crawl with 2 starting URL', message_text)
-        self.assertIn('View Crawl', message_text)
-        self.assertNotIn('scheduled to repeat', message_text)
-
-    def test_add_success_message_with_schedule(self):
-        """Test that success message includes schedule link."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'schedule': 'weekly',
-        }, follow=True)
-
-        # Check success message mentions schedule
-        messages = list(response.context['messages'])
-        self.assertEqual(len(messages), 1)
-        message_text = str(messages[0])
-
-        self.assertIn('Created crawl', message_text)
-        self.assertIn('scheduled to repeat weekly', message_text)
-        self.assertIn('View Crawl', message_text)
-
-    def test_add_crawl_creates_source_file(self):
-        """Test that crawl creation saves URLs to sources file."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        # Check that source file was created in sources/ directory
-        from archivebox.config import CONSTANTS
-        sources_dir = CONSTANTS.SOURCES_DIR
-
-        # Should have created a source file
-        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
-        self.assertGreater(len(source_files), 0)
-
-    def test_multiple_tags_are_saved(self):
-        """Test that multiple comma-separated tags are saved."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-            'tag': 'tag1,tag2,tag3',
-        })
-
-        self.assertEqual(response.status_code, 302)
-
-        crawl = require(Crawl.objects.first())
-        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
-
-    def test_crawl_redirects_to_admin_change_page(self):
-        """Test that successful submission redirects to crawl admin page."""
-        response = self.client.post(self.add_url, {
-            'url': 'https://example.com',
-            'depth': '0',
-        })
-
-        crawl = require(Crawl.objects.first())
-        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
-
-        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
--- a/archivebox/misc/jsonl.py
+++ b/archivebox/misc/jsonl.py
@@ -70,9 +70,13 @@ def parse_line(line: str) -> Optional[Dict[str, Any]]:
    if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
        return {'type': TYPE_SNAPSHOT, 'url': line}

-    # Could be a snapshot ID (UUID)
+    # Could be a snapshot ID (UUID with dashes or compact 32-char hex)
    if len(line) == 36 and line.count('-') == 4:
        return {'type': TYPE_SNAPSHOT, 'id': line}
+    # Compact IDs must be exactly 32 hex digits. int(line, 16) is too lenient
+    # for this check: it also accepts a sign, a "0x" prefix and "_" separators.
+    if len(line) == 32 and all(char in '0123456789abcdefABCDEF' for char in line):
+        return {'type': TYPE_SNAPSHOT, 'id': line}

    # Unknown format, skip
    return None
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -607,7 +607,7 @@ def log_worker_event(

    # Build final message
    error_str = f' {type(error).__name__}: {error}' if error else ''
-    from archivebox.misc.logging import CONSOLE
+    from archivebox.misc.logging import CONSOLE, STDERR
    from rich.text import Text

    # Create a Rich Text object for proper formatting
@@ -632,7 +632,11 @@ def log_worker_event(
    if metadata_str:
        text.append(f' | {metadata_str}')

-    CONSOLE.print(text, soft_wrap=True)
+    # Stdout is reserved for JSONL records whenever commands are piped together.
+    # Route worker/DB progress to stderr in non-TTY contexts so pipelines like
+    # `archivebox snapshot list | archivebox run` keep stdout machine-readable.
+    output_console = CONSOLE if sys.stdout.isatty() else STDERR
+    output_console.print(text, soft_wrap=True)


@enforce_types
--- a/archivebox/personas/tests.py
+++ b/archivebox/personas/tests.py
@@ -1,2 +0,0 @@
-
-# Create your tests here.
--- a/archivebox/tests/conftest.py
+++ b/archivebox/tests/conftest.py
@@ -3,8 +3,10 @@
 import os
 import sys
 import subprocess
+import tempfile
 import textwrap
 import time
+import shutil
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Tuple

@@ -14,6 +16,9 @@ from archivebox.uuid_compat import uuid7

 pytest_plugins = ["archivebox.tests.fixtures"]

+SESSION_DATA_DIR = Path(tempfile.mkdtemp(prefix="archivebox-pytest-session-")).resolve()
+os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR))
+

 # =============================================================================
 # CLI Helpers (defined before fixtures that use them)
@@ -82,6 +87,36 @@ def run_archivebox_cmd(
 # Fixtures
 # =============================================================================

+@pytest.fixture(autouse=True)
+def isolate_test_runtime(tmp_path):
+    """
+    Run each pytest test from an isolated temp cwd and restore env mutations.
+
+    The maintained pytest suite lives under ``archivebox/tests``. Many of those
+    CLI tests shell out without passing ``cwd=`` explicitly, so the safest
+    contract is that every test starts in its own temp directory and any
+    in-process ``os.environ`` edits are rolled back afterwards.
+
+    We intentionally clear ``DATA_DIR`` for the body of each test so subprocess
+    tests that rely on cwd keep working. During collection/import time we still
+    seed a separate session-scoped temp ``DATA_DIR`` above so any ArchiveBox
+    config imported before this fixture runs never points at the repo root.
+    """
+    original_cwd = Path.cwd()
+    original_env = os.environ.copy()
+    os.chdir(tmp_path)
+    os.environ.pop("DATA_DIR", None)
+    try:
+        yield
+    finally:
+        os.chdir(original_cwd)
+        os.environ.clear()
+        os.environ.update(original_env)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
+
@pytest.fixture
 def isolated_data_dir(tmp_path):
    """
--- a/archivebox/tests/fixtures.py
+++ b/archivebox/tests/fixtures.py
@@ -7,8 +7,11 @@ import pytest

@pytest.fixture
 def process(tmp_path):
-    os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'init'], capture_output=True)
+    process = subprocess.run(
+        ['archivebox', 'init'],
+        capture_output=True,
+        cwd=tmp_path,
+    )
    return process

@pytest.fixture
--- a/archivebox/tests/test_api_cli_schedule.py
+++ b/archivebox/tests/test_api_cli_schedule.py
@@ -1,17 +1,12 @@
-import importlib
 from io import StringIO

-from archivebox.config.django import setup_django
+from django.contrib.auth import get_user_model
+from django.test import RequestFactory, TestCase

-setup_django()
+from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
+from archivebox.crawls.models import CrawlSchedule

-User = importlib.import_module('django.contrib.auth.models').User
-TestCase = importlib.import_module('django.test').TestCase
-RequestFactory = importlib.import_module('django.test').RequestFactory
-api_v1_cli = importlib.import_module('archivebox.api.v1_cli')
-ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema
-cli_schedule = api_v1_cli.cli_schedule
-CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
+User = get_user_model()


 class CLIScheduleAPITests(TestCase):
--- a/archivebox/tests/test_cli_extract_input.py
+++ b/archivebox/tests/test_cli_extract_input.py
@@ -1,13 +1,10 @@
-#!/usr/bin/env python3
-"""Integration tests for archivebox extract command."""
+"""Tests for archivebox extract input handling and pipelines."""

 import os
 import subprocess
 import sqlite3
 import json

-import pytest
-


 def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -271,7 +268,3 @@ class TestExtractCLI:

        # Should show warning about no snapshots or exit normally (empty input)
        assert result.returncode == 0 or 'No' in result.stderr
-
-
-if __name__ == '__main__':
-    pytest.main([__file__, '-v'])
--- a/archivebox/tests/test_cli_piping.py
+++ b/archivebox/tests/test_cli_piping.py
@@ -0,0 +1,377 @@
+"""
+Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
+
+This file covers both:
+- low-level JSONL/stdin parsing behavior that makes CLI piping work
+- subprocess integration for the supported records `archivebox run` consumes
+"""
+
+import sqlite3
+import sys
+import uuid
+from io import StringIO
+from pathlib import Path
+
+from archivebox.tests.conftest import (
+    create_test_url,
+    parse_jsonl_output,
+    run_archivebox_cmd,
+)
+
+
+PIPE_TEST_ENV = {
+    "PLUGINS": "favicon",
+    "SAVE_FAVICON": "True",
+    "USE_COLOR": "False",
+    "SHOW_PROGRESS": "False",
+}
+
+
+class MockTTYStringIO(StringIO):
+    def __init__(self, initial_value: str = "", *, is_tty: bool):
+        StringIO.__init__(self, initial_value)  # seed the in-memory buffer
+        self._reports_tty = is_tty  # canned answer handed back by isatty()
+
+    def isatty(self) -> bool:
+        return self._reports_tty
+
+
+def _stdout_lines(stdout: str) -> list[str]:
+    return [text for text in stdout.splitlines() if text.strip() != ""]  # drop blank lines
+
+
+def _assert_stdout_is_jsonl_only(stdout: str) -> None:
+    non_blank = _stdout_lines(stdout)  # blank lines are ignored
+    assert non_blank, "Expected stdout to contain JSONL records"
+    assert all(entry.lstrip()[:1] == "{" for entry in non_blank), stdout
+
+
+def _sqlite_param(value: object) -> object:
+    """Return UUID-formatted strings as 32-char hex; pass other values through."""
+    try:
+        # Only str inputs are parsed; every other type is returned untouched.
+        return uuid.UUID(value).hex if isinstance(value, str) else value
+    except ValueError:
+        return value
+
+
+def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
+    db = sqlite3.connect(data_dir / "index.sqlite3")
+    try:
+        first_row = db.execute(sql, tuple(map(_sqlite_param, params))).fetchone()
+        return first_row[0] if first_row else None  # first column, or None when no row
+    finally:
+        db.close()
+
+
+def test_parse_line_accepts_supported_piping_inputs():
+    """The JSONL parser should normalize the input forms CLI pipes accept."""
+    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line
+
+    assert parse_line("") is None
+    assert parse_line("   ") is None
+    assert parse_line("# comment") is None
+    assert parse_line("not-a-url") is None
+    assert parse_line("ftp://example.com") is None
+
+    plain_url = parse_line("https://example.com")
+    assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}
+
+    file_url = parse_line("file:///tmp/example.txt")
+    assert file_url == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}
+
+    snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
+    assert snapshot_json is not None
+    assert snapshot_json["type"] == TYPE_SNAPSHOT
+    assert snapshot_json["tags"] == "tag1,tag2"
+
+    crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
+    assert crawl_json is not None
+    assert crawl_json["type"] == TYPE_CRAWL
+    assert crawl_json["id"] == "abc123"
+    assert crawl_json["max_depth"] == 1
+
+    snapshot_id = "01234567-89ab-cdef-0123-456789abcdef"
+    parsed_id = parse_line(snapshot_id)
+    assert parsed_id == {"type": TYPE_SNAPSHOT, "id": snapshot_id}
+
+    compact_snapshot_id = "0123456789abcdef0123456789abcdef"
+    compact_parsed_id = parse_line(compact_snapshot_id)
+    assert compact_parsed_id == {"type": TYPE_SNAPSHOT, "id": compact_snapshot_id}
+
+
+def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
+    """Piping helpers should consume args, structured JSONL, and pass-through records."""
+    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin
+
+    records = list(read_args_or_stdin(("https://example1.com", "https://example2.com")))
+    assert [record["url"] for record in records] == ["https://example1.com", "https://example2.com"]
+
+    stdin_records = list(
+        read_args_or_stdin(
+            (),
+            stream=MockTTYStringIO(
+                'https://plain-url.com\n'
+                '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
+                '{"type":"Tag","id":"tag-1","name":"example"}\n'
+                '01234567-89ab-cdef-0123-456789abcdef\n'
+                'not valid json\n',
+                is_tty=False,
+            ),
+        )
+    )
+    assert len(stdin_records) == 4
+    assert stdin_records[0]["url"] == "https://plain-url.com"
+    assert stdin_records[1]["url"] == "https://jsonl-url.com"
+    assert stdin_records[1]["tags"] == "test"
+    assert stdin_records[2]["type"] == "Tag"
+    assert stdin_records[2]["name"] == "example"
+    assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef"
+
+    crawl_records = list(
+        read_args_or_stdin(
+            (),
+            stream=MockTTYStringIO(
+                '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
+                is_tty=False,
+            ),
+        )
+    )
+    assert len(crawl_records) == 1
+    assert crawl_records[0]["type"] == TYPE_CRAWL
+    assert crawl_records[0]["id"] == "crawl-1"
+
+    tty_records = list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True)))
+    assert tty_records == []
+
+
+def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
+    """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping."""
+    from archivebox.hooks import collect_urls_from_plugins
+
+    (tmp_path / "wget").mkdir()
+    (tmp_path / "wget" / "urls.jsonl").write_text(
+        '{"url":"https://wget-link-1.com"}\n'
+        '{"url":"https://wget-link-2.com"}\n',
+        encoding="utf-8",
+    )
+    (tmp_path / "parse_html_urls").mkdir()
+    (tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
+        '{"url":"https://html-link-1.com"}\n'
+        '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
+        encoding="utf-8",
+    )
+    (tmp_path / "screenshot").mkdir()
+
+    urls = collect_urls_from_plugins(tmp_path)
+    assert len(urls) == 4
+    assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"}
+    titled = [url for url in urls if url.get("title") == "HTML Link 2"]
+    assert len(titled) == 1
+    assert titled[0]["url"] == "https://html-link-2.com"
+
+    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
+
+
+def test_crawl_create_stdout_pipes_into_run(initialized_archive):
+    """`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
+    url = create_test_url()
+
+    create_stdout, create_stderr, create_code = run_archivebox_cmd(
+        ["crawl", "create", url],
+        data_dir=initialized_archive,
+    )
+    assert create_code == 0, create_stderr
+    _assert_stdout_is_jsonl_only(create_stdout)
+
+    crawl = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Crawl")
+
+    run_stdout, run_stderr, run_code = run_archivebox_cmd(
+        ["run"],
+        stdin=create_stdout,
+        data_dir=initialized_archive,
+        timeout=120,
+        env=PIPE_TEST_ENV,
+    )
+    assert run_code == 0, run_stderr
+    _assert_stdout_is_jsonl_only(run_stdout)
+
+    run_records = parse_jsonl_output(run_stdout)
+    assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records)
+
+    snapshot_count = _db_value(
+        initialized_archive,
+        "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?",
+        (crawl["id"],),
+    )
+    assert isinstance(snapshot_count, int)
+    assert snapshot_count >= 1
+
+
+def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
+    """`archivebox snapshot list | archivebox run` should requeue listed snapshots."""
+    url = create_test_url()
+
+    create_stdout, create_stderr, create_code = run_archivebox_cmd(
+        ["snapshot", "create", url],
+        data_dir=initialized_archive,
+    )
+    assert create_code == 0, create_stderr
+    snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot")
+    # Filter on the URL just added: a URL filter can never match a snapshot ID.
+    list_stdout, list_stderr, list_code = run_archivebox_cmd(
+        ["snapshot", "list", "--status=queued", f"--url__icontains={url}"],
+        data_dir=initialized_archive,
+    )
+    if list_code != 0 or not parse_jsonl_output(list_stdout):
+        list_stdout, list_stderr, list_code = run_archivebox_cmd(
+            ["snapshot", "list", f"--url__icontains={url}"],
+            data_dir=initialized_archive,
+        )
+    assert list_code == 0, list_stderr
+    _assert_stdout_is_jsonl_only(list_stdout)
+
+    run_stdout, run_stderr, run_code = run_archivebox_cmd(
+        ["run"],
+        stdin=list_stdout,
+        data_dir=initialized_archive,
+        timeout=120,
+        env=PIPE_TEST_ENV,
+    )
+    assert run_code == 0, run_stderr
+    _assert_stdout_is_jsonl_only(run_stdout)
+
+    run_records = parse_jsonl_output(run_stdout)
+    assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records)
+
+    snapshot_status = _db_value(
+        initialized_archive,
+        "SELECT status FROM core_snapshot WHERE id = ?",
+        (snapshot["id"],),
+    )
+    assert snapshot_status == "sealed"
+
+
+def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
+    """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
+    url = create_test_url()
+
+    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
+        ["snapshot", "create", url],
+        data_dir=initialized_archive,
+    )
+    assert snapshot_code == 0, snapshot_stderr
+
+    ar_create_stdout, ar_create_stderr, ar_create_code = run_archivebox_cmd(
+        ["archiveresult", "create", "--plugin=favicon"],
+        stdin=snapshot_stdout,
+        data_dir=initialized_archive,
+    )
+    assert ar_create_code == 0, ar_create_stderr
+
+    created_records = parse_jsonl_output(ar_create_stdout)
+    archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
+
+    list_stdout, list_stderr, list_code = run_archivebox_cmd(
+        ["archiveresult", "list", "--plugin=favicon"],
+        data_dir=initialized_archive,
+    )
+    assert list_code == 0, list_stderr
+    _assert_stdout_is_jsonl_only(list_stdout)
+
+    orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
+        ["orchestrator"],
+        stdin=list_stdout,
+        data_dir=initialized_archive,
+        timeout=120,
+        env=PIPE_TEST_ENV,
+    )
+    assert orchestrator_code == 0, orchestrator_stderr
+    _assert_stdout_is_jsonl_only(orchestrator_stdout)
+    assert "renamed to `archivebox run`" in orchestrator_stderr
+
+    run_records = parse_jsonl_output(orchestrator_stdout)
+    assert any(
+        record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
+        for record in run_records
+    )
+
+
+def test_binary_create_stdout_pipes_into_run(initialized_archive):
+    """`archivebox binary create | archivebox run` should queue the binary record for processing."""
+    create_stdout, create_stderr, create_code = run_archivebox_cmd(
+        ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"],
+        data_dir=initialized_archive,
+    )
+    assert create_code == 0, create_stderr
+    _assert_stdout_is_jsonl_only(create_stdout)
+
+    binary = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Binary")
+
+    run_stdout, run_stderr, run_code = run_archivebox_cmd(
+        ["run"],
+        stdin=create_stdout,
+        data_dir=initialized_archive,
+        timeout=120,
+    )
+    assert run_code == 0, run_stderr
+    _assert_stdout_is_jsonl_only(run_stdout)
+
+    run_records = parse_jsonl_output(run_stdout)
+    assert any(record.get("type") == "Binary" and record.get("id") == binary["id"] for record in run_records)
+
+    status = _db_value(
+        initialized_archive,
+        "SELECT status FROM machine_binary WHERE id = ?",
+        (binary["id"],),
+    )
+    assert status in {"queued", "installed"}
+
+
+def test_multi_stage_pipeline_into_run(initialized_archive):
+    """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work."""
+    url = create_test_url()
+
+    crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd(
+        ["crawl", "create", url],
+        data_dir=initialized_archive,
+    )
+    assert crawl_code == 0, crawl_stderr
+    _assert_stdout_is_jsonl_only(crawl_stdout)
+
+    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
+        ["snapshot", "create"],
+        stdin=crawl_stdout,
+        data_dir=initialized_archive,
+    )
+    assert snapshot_code == 0, snapshot_stderr
+    _assert_stdout_is_jsonl_only(snapshot_stdout)
+
+    archiveresult_stdout, archiveresult_stderr, archiveresult_code = run_archivebox_cmd(
+        ["archiveresult", "create", "--plugin=favicon"],
+        stdin=snapshot_stdout,
+        data_dir=initialized_archive,
+    )
+    assert archiveresult_code == 0, archiveresult_stderr
+    _assert_stdout_is_jsonl_only(archiveresult_stdout)
+
+    run_stdout, run_stderr, run_code = run_archivebox_cmd(
+        ["run"],
+        stdin=archiveresult_stdout,
+        data_dir=initialized_archive,
+        timeout=120,
+        env=PIPE_TEST_ENV,
+    )
+    assert run_code == 0, run_stderr
+    _assert_stdout_is_jsonl_only(run_stdout)
+
+    run_records = parse_jsonl_output(run_stdout)
+    snapshot = next(record for record in run_records if record.get("type") == "Snapshot")
+    assert any(record.get("type") == "ArchiveResult" for record in run_records)
+
+    snapshot_status = _db_value(
+        initialized_archive,
+        "SELECT status FROM core_snapshot WHERE id = ?",
+        (snapshot["id"],),
+    )
+    assert snapshot_status == "sealed"
--- a/archivebox/tests/test_extractors.py
+++ b/archivebox/tests/test_extractors.py
@@ -1,156 +0,0 @@
-import json as pyjson
-import sqlite3
-import subprocess
-from pathlib import Path
-
-from .fixtures import disable_extractors_dict, process
-
-FIXTURES = (disable_extractors_dict, process)
-
-
-def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
-    candidates = {snapshot_id}
-    if len(snapshot_id) == 32:
-        candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
-    elif len(snapshot_id) == 36 and "-" in snapshot_id:
-        candidates.add(snapshot_id.replace("-", ""))
-
-    for needle in candidates:
-        for path in data_dir.rglob(needle):
-            if path.is_dir():
-                return path
-    return None
-
-
-def _latest_snapshot_dir(data_dir: Path) -> Path:
-    conn = sqlite3.connect(data_dir / "index.sqlite3")
-    try:
-        snapshot_id = conn.execute(
-            "SELECT id FROM core_snapshot ORDER BY created_at DESC LIMIT 1"
-        ).fetchone()
-    finally:
-        conn.close()
-
-    assert snapshot_id is not None, "Expected a snapshot to be created"
-    snapshot_dir = _find_snapshot_dir(data_dir, str(snapshot_id[0]))
-    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id[0]}"
-    return snapshot_dir
-
-
-def _latest_plugin_result(data_dir: Path, plugin: str) -> tuple[str, str, dict]:
-    conn = sqlite3.connect(data_dir / "index.sqlite3")
-    try:
-        row = conn.execute(
-            "SELECT snapshot_id, status, output_files FROM core_archiveresult "
-            "WHERE plugin = ? ORDER BY created_at DESC LIMIT 1",
-            (plugin,),
-        ).fetchone()
-    finally:
-        conn.close()
-
-    assert row is not None, f"Expected an ArchiveResult row for plugin={plugin}"
-    output_files = row[2]
-    if isinstance(output_files, str):
-        output_files = pyjson.loads(output_files or "{}")
-    output_files = output_files or {}
-    return str(row[0]), str(row[1]), output_files
-
-
-def _plugin_output_paths(data_dir: Path, plugin: str) -> list[Path]:
-    snapshot_id, status, output_files = _latest_plugin_result(data_dir, plugin)
-    assert status == "succeeded", f"Expected {plugin} ArchiveResult to succeed, got {status}"
-    assert output_files, f"Expected {plugin} ArchiveResult to record output_files"
-
-    snapshot_dir = _find_snapshot_dir(data_dir, snapshot_id)
-    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
-
-    plugin_dir = snapshot_dir / plugin
-    output_paths = [plugin_dir / rel_path for rel_path in output_files.keys()]
-    missing_paths = [path for path in output_paths if not path.exists()]
-    assert not missing_paths, f"Expected plugin outputs to exist on disk, missing: {missing_paths}"
-    return output_paths
-
-
-def _archivebox_env(base_env: dict, data_dir: Path) -> dict:
-    env = base_env.copy()
-    tmp_dir = Path("/tmp") / f"abx-{data_dir.name}"
-    tmp_dir.mkdir(parents=True, exist_ok=True)
-    env["TMP_DIR"] = str(tmp_dir)
-    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
-    return env
-
-
-def test_singlefile_works(tmp_path, process, disable_extractors_dict):
-    data_dir = Path.cwd()
-    env = _archivebox_env(disable_extractors_dict, data_dir)
-    env.update({"SAVE_SINGLEFILE": "true"})
-    add_process = subprocess.run(
-        ['archivebox', 'add', '--plugins=singlefile', 'https://example.com'],
-        capture_output=True,
-        text=True,
-        env=env,
-        timeout=900,
-    )
-    assert add_process.returncode == 0, add_process.stderr
-    output_files = _plugin_output_paths(data_dir, "singlefile")
-    assert any(path.suffix in (".html", ".htm") for path in output_files)
-
-def test_readability_works(tmp_path, process, disable_extractors_dict):
-    data_dir = Path.cwd()
-    env = _archivebox_env(disable_extractors_dict, data_dir)
-    env.update({"SAVE_SINGLEFILE": "true", "SAVE_READABILITY": "true"})
-    add_process = subprocess.run(
-        ['archivebox', 'add', '--plugins=singlefile,readability', 'https://example.com'],
-        capture_output=True,
-        text=True,
-        env=env,
-        timeout=900,
-    )
-    assert add_process.returncode == 0, add_process.stderr
-    output_files = _plugin_output_paths(data_dir, "readability")
-    assert any(path.suffix in (".html", ".htm") for path in output_files)
-
-def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
-    data_dir = Path.cwd()
-    env = _archivebox_env(disable_extractors_dict, data_dir)
-    env.update({"SAVE_WGET": "true", "SAVE_HTMLTOTEXT": "true"})
-    add_process = subprocess.run(
-        ['archivebox', 'add', '--plugins=wget,htmltotext', 'https://example.com'],
-        capture_output=True,
-        text=True,
-        env=env,
-        timeout=900,
-    )
-    assert add_process.returncode == 0, add_process.stderr
-    output_files = _plugin_output_paths(data_dir, "htmltotext")
-    assert any(path.suffix == ".txt" for path in output_files)
-
-def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
-    env = _archivebox_env(disable_extractors_dict, Path.cwd())
-    env.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
-    add_process = subprocess.run(['archivebox', 'add', '--plugins=readability,dom,singlefile', 'https://example.com'],
-                                  capture_output=True, env=env)
-    output_str = add_process.stdout.decode("utf-8")
-    assert "> singlefile" not in output_str
-    assert "> readability" not in output_str
-
-def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
-    data_dir = Path.cwd()
-    env = _archivebox_env(disable_extractors_dict, data_dir)
-    env.update({"SAVE_HEADERS": "true"})
-    add_process = subprocess.run(
-        ['archivebox', 'add', '--plugins=headers', 'https://example.com'],
-        capture_output=True,
-        text=True,
-        env=env,
-        timeout=900,
-    )
-    assert add_process.returncode == 0, add_process.stderr
-    output_files = _plugin_output_paths(data_dir, "headers")
-    output_file = next((path for path in output_files if path.suffix == ".json"), None)
-    assert output_file is not None, f"Expected headers output_files to include a JSON file, got: {output_files}"
-    with open(output_file, 'r', encoding='utf-8') as f:
-        headers = pyjson.load(f)
-    response_headers = headers.get("response_headers") or headers.get("headers") or {}
-    assert isinstance(response_headers, dict), f"Expected response_headers dict, got: {response_headers!r}"
-    assert 'Content-Type' in response_headers or 'content-type' in response_headers
--- a/archivebox/machine/tests/test_machine_models.py
+++ b/archivebox/machine/tests/test_machine_models.py
--- a/archivebox/workers/tests/test_orchestrator.py
+++ b/archivebox/workers/tests/test_orchestrator.py
--- a/archivebox/tests/test_savepagenow.py
+++ b/archivebox/tests/test_savepagenow.py
@@ -13,7 +13,6 @@ ADMIN_HOST = 'admin.archivebox.localhost:8000'


 def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
-    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
        import os
@@ -81,7 +80,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte

    return subprocess.run(
        [sys.executable, '-c', script],
-        cwd=project_root,
+        cwd=initialized_archive,
        env=env,
        text=True,
        capture_output=True,
@@ -90,7 +89,6 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte


 def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
-    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
        import os
@@ -137,7 +135,7 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st

    return subprocess.run(
        [sys.executable, '-c', script],
-        cwd=project_root,
+        cwd=initialized_archive,
        env=env,
        text=True,
        capture_output=True,
@@ -146,7 +144,6 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st


 def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
-    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
        import os
@@ -199,7 +196,7 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request

    return subprocess.run(
        [sys.executable, '-c', script],
-        cwd=project_root,
+        cwd=initialized_archive,
        env=env,
        text=True,
        capture_output=True,
--- a/archivebox/workers/tests/test_scheduled_crawls.py
+++ b/archivebox/workers/tests/test_scheduled_crawls.py
--- a/archivebox/workers/tests/test_snapshot_worker.py
+++ b/archivebox/workers/tests/test_snapshot_worker.py
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -29,6 +29,7 @@ Usage:
 __package__ = 'archivebox.workers'

 import os
+import sys
 import time
 from typing import Type
 from datetime import datetime, timedelta
@@ -258,9 +259,7 @@ class Orchestrator:
    def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
        """Spawn a new worker process. Returns PID or None if spawn failed."""
        try:
-            print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
            pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
-            print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')

            # CRITICAL: Block until worker registers itself in Process table
            # This prevents race condition where orchestrator spawns multiple workers
@@ -281,17 +280,6 @@ class Orchestrator:
                # 4. Parent is this orchestrator
                # 5. Started recently (within last 10 seconds)

-                # Debug: Check all processes with this PID first
-                if elapsed < 0.5:
-                    all_procs = list(Process.objects.filter(pid=pid))
-                    print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
-                    print(f'[yellow]  Found {len(all_procs)} Process records for pid={pid}[/yellow]')
-                    for p in all_procs:
-                        print(
-                            f'[yellow]  -> type={p.process_type} status={p.status} '
-                            f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
-                        )
-
                worker_process = Process.objects.filter(
                    pid=pid,
                    process_type=Process.TypeChoices.WORKER,
@@ -302,7 +290,6 @@ class Orchestrator:

                if worker_process:
                    # Worker successfully registered!
-                    print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
                    return pid

                time.sleep(poll_interval)
@@ -653,14 +640,15 @@ class Orchestrator:
    def runloop(self) -> None:
        """Main orchestrator loop."""
        from rich.live import Live
-        from archivebox.misc.logging import IS_TTY
        from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
-        import sys
        import os

+        is_tty = sys.stdout.isatty()
        # Enable progress layout only in TTY + foreground mode
-        show_progress = IS_TTY and self.exit_on_idle
-        plain_output = not IS_TTY
+        show_progress = is_tty and self.exit_on_idle
+        # When stdout is not a TTY, it may be reserved for JSONL pipeline output.
+        # Keep the plain progress view, but emit it to stderr instead of stdout.
+        plain_output = not is_tty
        self.on_startup()

        if not show_progress:
@@ -1241,7 +1229,7 @@ class Orchestrator:
                            ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
                            for panel, line in new_lines:
                                if line:
-                                    print(f"[{ts}] [{panel}] {line}")
+                                    print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
                        last_plain_lines = set(plain_lines)

                # Track idle state
@@ -1271,7 +1259,7 @@ class Orchestrator:
        except KeyboardInterrupt:
            if progress_layout:
                progress_layout.log_event("Interrupted by user", style="red")
-            print()  # Newline after ^C
+            print(file=sys.stderr)  # Newline after ^C
            self.on_shutdown(error=KeyboardInterrupt())
        except BaseException as e:
            if progress_layout:
@@ -1310,7 +1298,7 @@ class Orchestrator:
        Used by commands like 'add' to ensure orchestrator is running.
        """
        if cls.is_running():
-            print('[grey53]👨‍✈️ Orchestrator already running[/grey53]')
+            print('[grey53]👨‍✈️ Orchestrator already running[/grey53]', file=sys.stderr)
            # Return a placeholder - actual orchestrator is in another process
            return cls(exit_on_idle=exit_on_idle)

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -176,7 +176,7 @@ package-dir = {"archivebox" = "archivebox"}
 line-length = 140
 target-version = "py313"
 src = ["archivebox"]
-exclude = ["*.pyi", "typings/", "migrations/"]
+exclude = ["*.pyi", "typings/", "migrations/", "archivebox/tests/data/"]

 # https://docs.astral.sh/ruff/rules/
 [tool.ruff.lint]
@@ -184,6 +184,7 @@ ignore = ["E731", "E303", "E266", "E241", "E222"]

 [tool.pytest.ini_options]
 testpaths = [ "archivebox/tests" ]
+norecursedirs = ["archivebox/tests/data"]
 DJANGO_SETTINGS_MODULE = "archivebox.core.settings"
 # Note: Plugin tests under abx_plugins/plugins/ must NOT load Django
 # They use a conftest.py to disable Django automatically
@@ -254,6 +255,8 @@ exclude = [
    "**/node_modules",
    "**/__pycache__",
    "**/migrations",
+    "archivebox/tests/data",
+    "archivebox/tests/data/**",
 ]
 stubPath = "./typings"
 venvPath = "."
@@ -267,7 +270,7 @@ pythonPlatform = "Linux"

 [tool.ty]
 environment = { python-version = "3.13", python-platform = "linux" }
-src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations"] }
+src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations", "archivebox/tests/data", "archivebox/tests/data/**"] }


 [project.scripts]