Cleanup ArchiveBox tests

This commit is contained in:
Nick Sweeting
2026-03-15 22:09:56 -07:00
parent 9de084da65
commit 57e11879ec
23 changed files with 487 additions and 1495 deletions

View File

@@ -107,7 +107,10 @@ class ArchiveBoxGroup(click.Group):
# handle renamed commands
if cmd_name in self.renamed_commands:
new_name = self.renamed_commands[cmd_name]
print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
print(
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
file=sys.stderr,
)
cmd_name = new_name
ctx.invoked_subcommand = cmd_name

View File

@@ -63,11 +63,28 @@ def create_binary(
return 1
try:
binary, created = Binary.objects.get_or_create(
from archivebox.machine.models import Machine
machine = Machine.current()
created = not Binary.objects.filter(
machine=machine,
name=name,
abspath=abspath,
defaults={'version': version}
)
version=version,
).exists()
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
# records are owned by the current machine and can be safely piped into
# `archivebox run` without creating invalid rows missing machine_id.
binary = Binary.from_json({
'name': name,
'abspath': abspath,
'version': version,
'binproviders': 'env',
'binprovider': 'env',
})
if binary is None:
raise ValueError('failed to create binary record')
if not is_tty:
write_record(binary.to_json())

View File

@@ -81,6 +81,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
def run_plugins(
args: tuple,
records: list[dict] | None = None,
plugins: str = '',
wait: bool = True,
) -> int:
@@ -108,8 +109,12 @@ def run_plugins(
# Parse comma-separated plugins list once (reused in creation and filtering)
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
# Collect all input records
records = list(read_args_or_stdin(args))
# Parse stdin/args exactly once per CLI invocation.
# `main()` may already have consumed stdin to distinguish Snapshot input from
# ArchiveResult IDs; if so, it must pass the parsed records through here
# instead of asking this helper to reread an already-drained pipe.
if records is None:
records = list(read_args_or_stdin(args))
if not records:
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
@@ -269,7 +274,7 @@ def main(plugins: str, wait: bool, args: tuple):
sys.exit(exit_code)
else:
# Default behavior: run plugins on Snapshots from input
sys.exit(run_plugins(args, plugins=plugins, wait=wait))
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
if __name__ == '__main__':

View File

@@ -1,231 +0,0 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
import importlib
import os
import shutil
import sys
import unittest
from contextlib import contextmanager
from pathlib import Path
from archivebox.config.constants import CONSTANTS
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'DATA_DIR': 'data.tests',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
}
DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
init = importlib.import_module('archivebox.main').init
SQL_INDEX_FILENAME = CONSTANTS.SQL_INDEX_FILENAME
JSON_INDEX_FILENAME = CONSTANTS.JSON_INDEX_FILENAME
HTML_INDEX_FILENAME = CONSTANTS.HTML_INDEX_FILENAME
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
HIDE_CLI_OUTPUT = True
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
stdout = sys.stdout
stderr = sys.stderr
def load_main_index(*, out_dir: str):
    """Return all entries from the JSON main index inside *out_dir*.

    Raises FileNotFoundError if the index file has not been created yet,
    so callers can assert that an archive does/doesn't exist.
    """
    out_path = Path(out_dir)
    index_file = out_path / JSON_INDEX_FILENAME
    if not index_file.exists():
        raise FileNotFoundError(index_file)
    return list(parse_json_main_index(out_path))
@contextmanager
def output_hidden(show_failing=True):
    """Silence stdout/stderr by redirecting them to temp files for the block.

    If the wrapped code raises and *show_failing* is True, the captured output
    is replayed to the real stdout/stderr before the exception propagates.
    No-op when HIDE_CLI_OUTPUT is False (e.g. running with --verbose).
    """
    if not HIDE_CLI_OUTPUT:
        yield
        return
    sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
    sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
    try:
        yield
    # BUGFIX: this was `except Exception`, which does NOT catch SystemExit
    # (a BaseException) raised by the CLI commands under test — e.g. in
    # test_conflicting_init — so sys.stdout/stderr were left pointing at
    # closed, deleted temp files for the rest of the test run.
    except BaseException:
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = stdout
        sys.stderr = stderr
        if show_failing:
            with open('stdout.txt', 'r', encoding='utf-8') as f:
                print(f.read())
            with open('stderr.txt', 'r', encoding='utf-8') as f:
                print(f.read())
        raise
    else:
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = stdout
        sys.stderr = stderr
    finally:
        os.remove('stdout.txt')
        os.remove('stderr.txt')
class TestInit(unittest.TestCase):
    """`archivebox init` in fresh, conflicting, and re-initialized data dirs."""

    def setUp(self):
        os.makedirs(DATA_DIR, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def test_basic_init(self):
        """init in an empty dir creates all three indexes with zero entries."""
        with output_hidden():
            archivebox_init.main([])
        assert (Path(DATA_DIR) / SQL_INDEX_FILENAME).exists()
        assert (Path(DATA_DIR) / JSON_INDEX_FILENAME).exists()
        assert (Path(DATA_DIR) / HTML_INDEX_FILENAME).exists()
        assert len(load_main_index(out_dir=DATA_DIR)) == 0

    def test_conflicting_init(self):
        """init must refuse to run in a non-empty, non-archive dir."""
        with open(Path(DATA_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
            f.write('test')
        try:
            with output_hidden(show_failing=False):
                archivebox_init.main([])
            assert False, 'Init should have exited with an exception'
        except SystemExit:
            pass
        assert not (Path(DATA_DIR) / SQL_INDEX_FILENAME).exists()
        assert not (Path(DATA_DIR) / JSON_INDEX_FILENAME).exists()
        assert not (Path(DATA_DIR) / HTML_INDEX_FILENAME).exists()
        # BUGFIX: the old `try: ...; assert False; except Exception: pass`
        # swallowed its own AssertionError (AssertionError is an Exception),
        # so this check could never fail. assertRaises actually enforces
        # that loading a missing index raises.
        with self.assertRaises(Exception):
            load_main_index(out_dir=DATA_DIR)

    def test_no_dirty_state(self):
        """A deleted data dir can be re-initialized cleanly from scratch."""
        with output_hidden():
            init()
        shutil.rmtree(DATA_DIR, ignore_errors=True)
        with output_hidden():
            init()
class TestAdd(unittest.TestCase):
    """`archivebox add` fed by an argument URL, an input file, and stdin."""

    def setUp(self):
        os.makedirs(DATA_DIR, exist_ok=True)
        with output_hidden():
            init()

    def tearDown(self):
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def test_add_arg_url(self):
        """Adding a feed URL imports all of its entries."""
        with output_hidden():
            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
        assert len(load_main_index(out_dir=DATA_DIR)) == 30

    def test_add_arg_file(self):
        """Adding a text file of URLs imports the parseable ones."""
        test_file = Path(DATA_DIR) / 'test.txt'
        with open(test_file, 'w+', encoding='utf') as f:
            f.write(test_urls)
        with output_hidden():
            archivebox_add.main([test_file])
        assert len(load_main_index(out_dir=DATA_DIR)) == 12
        os.remove(test_file)

    def test_add_stdin_url(self):
        """Piping URLs via stdin imports the parseable ones."""
        with output_hidden():
            archivebox_add.main([], stdin=test_urls)
        assert len(load_main_index(out_dir=DATA_DIR)) == 12
class TestRemove(unittest.TestCase):
    """`archivebox remove` with exact-URL, regex, and domain filters."""

    def setUp(self):
        # Every test starts from a fresh archive seeded with the test URLs.
        os.makedirs(DATA_DIR, exist_ok=True)
        with output_hidden():
            init()
            archivebox_add.main([], stdin=test_urls)

    def tearDown(self):
        # BUGFIX: tearDown was commented out, which left the data.tests
        # directory (and its archive state) behind after the suite finished
        # and leaked it into whatever ran next. setUp rebuilds the archive
        # from scratch every time, so deleting it here is always safe.
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def test_remove_exact(self):
        """Removing one exact URL leaves 11 of the 12 entries."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
        all_links = load_main_index(out_dir=DATA_DIR)
        assert len(all_links) == 11

    def test_remove_regex(self):
        """A regex matching all exampleN.com URLs removes 8, leaving 4."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
        all_links = load_main_index(out_dir=DATA_DIR)
        assert len(all_links) == 4

    def test_remove_domain(self):
        """The domain filter removes both example5.com and example6.com entries."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
        all_links = load_main_index(out_dir=DATA_DIR)
        assert len(all_links) == 10

    def test_remove_none(self):
        """Removing a URL that was never archived should raise."""
        # BUGFIX: the old `try: ...; assert False; except Exception: pass`
        # swallowed its own AssertionError, so this check could never fail.
        # assertRaises enforces that an exception really is raised when
        # nothing matches the given URL.
        with self.assertRaises(Exception):
            with output_hidden(show_failing=False):
                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
# Allow running this test file directly. Passing --verbose/-v disables the
# output capturing done by output_hidden() so CLI output streams straight
# to the terminal instead of temp files.
if __name__ == '__main__':
    if '--verbose' in sys.argv or '-v' in sys.argv:
        HIDE_CLI_OUTPUT = False
    unittest.main()

View File

@@ -1,665 +0,0 @@
#!/usr/bin/env python3
"""
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
This module tests the JSONL-based piping between CLI commands as described in:
https://github.com/ArchiveBox/ArchiveBox/issues/1363
Workflows tested:
archivebox crawl create URL -> Crawl JSONL
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
archivebox run -> Process queued records (accepts any JSONL)
Pipeline:
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
Each command should:
- Accept URLs, IDs, or JSONL as input (args or stdin)
- Output JSONL to stdout when piped (not TTY)
- Output human-readable to stderr when TTY
"""
__package__ = 'archivebox.cli'
import os
import json
import shutil
import tempfile
import unittest
from io import StringIO
from pathlib import Path
from typing import TypeVar
# Test configuration - disable slow extractors
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_YTDLP': 'False',
'SAVE_HEADERS': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
'USE_NODE': 'False',
}
os.environ.update(TEST_CONFIG)
T = TypeVar('T')
def require(value: 'T | None') -> 'T':
    """Narrow an Optional to its value, failing loudly on None."""
    if value is not None:
        return value
    raise AssertionError('Expected value to be present')
class MockTTYStringIO(StringIO):
    """An in-memory text stream reporting a caller-chosen isatty() answer.

    Lets tests simulate both piped input (is_tty=False) and an interactive
    terminal (is_tty=True) without needing a real TTY.
    """

    def __init__(self, initial_value: str = '', *, is_tty: bool):
        super().__init__(initial_value)
        self._tty = is_tty

    def isatty(self) -> bool:
        """Return the fixed TTY flag supplied at construction time."""
        return self._tty
# =============================================================================
# JSONL Utility Tests
# =============================================================================
class TestJSONLParsing(unittest.TestCase):
    """Unit tests for archivebox.misc.jsonl.parse_line input handling."""

    def test_parse_plain_url(self):
        """A bare URL line is promoted to a Snapshot record."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        rec = require(parse_line('https://example.com'))
        self.assertEqual(rec['type'], TYPE_SNAPSHOT)
        self.assertEqual(rec['url'], 'https://example.com')

    def test_parse_jsonl_snapshot(self):
        """All fields of a JSONL Snapshot record survive parsing."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        rec = require(parse_line('{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'))
        self.assertEqual(rec['type'], TYPE_SNAPSHOT)
        self.assertEqual(rec['url'], 'https://example.com')
        self.assertEqual(rec['tags'], 'test,demo')

    def test_parse_jsonl_crawl(self):
        """A JSONL Crawl record parses with id, urls, and max_depth intact."""
        from archivebox.misc.jsonl import parse_line, TYPE_CRAWL
        rec = require(parse_line('{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'))
        self.assertEqual(rec['type'], TYPE_CRAWL)
        self.assertEqual(rec['id'], 'abc123')
        self.assertEqual(rec['urls'], 'https://example.com')
        self.assertEqual(rec['max_depth'], 1)

    def test_parse_jsonl_with_id(self):
        """An explicit id field is preserved alongside the url."""
        from archivebox.misc.jsonl import parse_line
        rec = require(parse_line('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'))
        self.assertEqual(rec['id'], 'abc123')
        self.assertEqual(rec['url'], 'https://example.com')

    def test_parse_uuid_as_snapshot_id(self):
        """A bare UUID is interpreted as a Snapshot id."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        uuid = '01234567-89ab-cdef-0123-456789abcdef'
        rec = require(parse_line(uuid))
        self.assertEqual(rec['type'], TYPE_SNAPSHOT)
        self.assertEqual(rec['id'], uuid)

    def test_parse_empty_line(self):
        """Blank and whitespace-only lines parse to None."""
        from archivebox.misc.jsonl import parse_line
        for blank in ('', ' ', '\n'):
            self.assertIsNone(parse_line(blank))

    def test_parse_comment_line(self):
        """Lines whose first non-space char is '#' parse to None."""
        from archivebox.misc.jsonl import parse_line
        for comment in ('# This is a comment', ' # Indented comment'):
            self.assertIsNone(parse_line(comment))

    def test_parse_invalid_url(self):
        """Non-URL text and unsupported schemes parse to None (only http/https/file)."""
        from archivebox.misc.jsonl import parse_line
        self.assertIsNone(parse_line('not-a-url'))
        self.assertIsNone(parse_line('ftp://example.com'))

    def test_parse_file_url(self):
        """file:// URLs are accepted as Snapshot records."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        rec = require(parse_line('file:///path/to/file.txt'))
        self.assertEqual(rec['type'], TYPE_SNAPSHOT)
        self.assertEqual(rec['url'], 'file:///path/to/file.txt')
# Note: JSONL output serialization is tested in TestPipingWorkflowIntegration
# using real model instances, not mocks.
class TestReadArgsOrStdin(unittest.TestCase):
    """read_args_or_stdin should merge CLI args and piped stdin correctly."""

    def test_read_from_args(self):
        """URLs passed as args are yielded in order."""
        from archivebox.misc.jsonl import read_args_or_stdin
        recs = list(read_args_or_stdin(('https://example1.com', 'https://example2.com')))
        self.assertEqual([r['url'] for r in recs], ['https://example1.com', 'https://example2.com'])

    def test_read_from_stdin(self):
        """With no args, URLs are read line-by-line from the piped stream."""
        from archivebox.misc.jsonl import read_args_or_stdin
        pipe = MockTTYStringIO('https://example1.com\nhttps://example2.com\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual([r['url'] for r in recs], ['https://example1.com', 'https://example2.com'])

    def test_read_jsonl_from_stdin(self):
        """JSONL records arriving on stdin keep their extra fields."""
        from archivebox.misc.jsonl import read_args_or_stdin
        pipe = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['url'], 'https://example.com')
        self.assertEqual(recs[0]['tags'], 'test')

    def test_read_crawl_jsonl_from_stdin(self):
        """Crawl JSONL records on stdin are recognized by type."""
        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
        pipe = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['type'], TYPE_CRAWL)
        self.assertEqual(recs[0]['id'], 'abc123')

    def test_skip_tty_stdin(self):
        """An interactive TTY stream is ignored so the call never blocks."""
        from archivebox.misc.jsonl import read_args_or_stdin
        tty = MockTTYStringIO('https://example.com', is_tty=True)
        self.assertEqual(len(list(read_args_or_stdin((), stream=tty))), 0)
# =============================================================================
# Unit Tests for Individual Commands
# =============================================================================
class TestCrawlCommand(unittest.TestCase):
    """Unit tests for the `archivebox crawl` command's input/output contract."""

    def setUp(self):
        # Point DATA_DIR at a throwaway directory so nothing touches real data.
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_crawl_accepts_url(self):
        """A single URL argument is parsed into one record."""
        from archivebox.misc.jsonl import read_args_or_stdin
        recs = list(read_args_or_stdin(('https://example.com',)))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['url'], 'https://example.com')

    def test_crawl_output_format(self):
        """Crawl JSONL output carries type, id, and urls fields."""
        from archivebox.misc.jsonl import TYPE_CRAWL
        # NOTE(review): this checks a hand-built stand-in record, not real
        # command output — it documents the expected shape rather than testing it.
        sample = {
            'type': TYPE_CRAWL,
            'schema_version': '0.9.0',
            'id': 'test-crawl-id',
            'urls': 'https://example.com',
            'status': 'queued',
            'max_depth': 0,
        }
        self.assertEqual(sample['type'], TYPE_CRAWL)
        self.assertIn('id', sample)
        self.assertIn('urls', sample)
class TestSnapshotCommand(unittest.TestCase):
    """Unit tests for `archivebox snapshot` input parsing."""

    def setUp(self):
        # Point DATA_DIR at a throwaway directory so nothing touches real data.
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_snapshot_accepts_url(self):
        """A URL argument yields one record with that url."""
        from archivebox.misc.jsonl import read_args_or_stdin
        recs = list(read_args_or_stdin(('https://example.com',)))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['url'], 'https://example.com')

    def test_snapshot_accepts_crawl_jsonl(self):
        """Piped Crawl JSONL is parsed with type/id/urls preserved."""
        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL
        pipe = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['type'], TYPE_CRAWL)
        self.assertEqual(recs[0]['id'], 'abc123')
        self.assertEqual(recs[0]['urls'], 'https://example.com')

    def test_snapshot_accepts_jsonl_with_metadata(self):
        """Tags and title metadata pass through JSONL parsing untouched."""
        from archivebox.misc.jsonl import read_args_or_stdin
        pipe = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['url'], 'https://example.com')
        self.assertEqual(recs[0]['tags'], 'tag1,tag2')
        self.assertEqual(recs[0]['title'], 'Test')
# Note: Snapshot output format is tested in integration tests
# (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
# using real Snapshot instances.
class TestArchiveResultCommand(unittest.TestCase):
    """Unit tests for `archivebox archiveresult` input handling."""

    def setUp(self):
        # Point DATA_DIR at a throwaway directory so nothing touches real data.
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_archiveresult_accepts_snapshot_id(self):
        """A bare UUID argument is parsed as a snapshot id."""
        from archivebox.misc.jsonl import read_args_or_stdin
        uuid = '01234567-89ab-cdef-0123-456789abcdef'
        recs = list(read_args_or_stdin((uuid,)))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['id'], uuid)

    def test_archiveresult_accepts_jsonl_snapshot(self):
        """Piped Snapshot JSONL is recognized by type and id."""
        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
        pipe = MockTTYStringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n', is_tty=False)
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 1)
        self.assertEqual(recs[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(recs[0]['id'], 'abc123')

    def test_archiveresult_gathers_snapshot_ids(self):
        """Snapshot ids are collected from Snapshot, ArchiveResult, and bare-id records."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
        records = [
            {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
            {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
            {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
            {'id': 'snap-4'},  # untyped record carrying a bare id
        ]
        snapshot_ids = set()
        for rec in records:
            kind = rec.get('type')
            if kind == TYPE_SNAPSHOT:
                sid = rec.get('id')
            elif kind == TYPE_ARCHIVERESULT:
                sid = rec.get('snapshot_id')
            else:
                sid = rec.get('id')
            if sid:
                snapshot_ids.add(sid)
        self.assertEqual(snapshot_ids, {'snap-1', 'snap-2', 'snap-3', 'snap-4'})
# =============================================================================
# URL Collection Tests
# =============================================================================
class TestURLCollection(unittest.TestCase):
    """collect_urls_from_plugins should gather urls.jsonl from plugin output dirs."""

    def setUp(self):
        # Build a fake snapshot output tree: two parser plugins that wrote
        # urls.jsonl, plus one extractor (screenshot) that did not.
        self.test_dir = Path(tempfile.mkdtemp())
        (self.test_dir / 'wget').mkdir()
        (self.test_dir / 'wget' / 'urls.jsonl').write_text(
            '{"url": "https://wget-link-1.com"}\n'
            '{"url": "https://wget-link-2.com"}\n'
        )
        (self.test_dir / 'parse_html_urls').mkdir()
        (self.test_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://html-link-1.com"}\n'
            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
        )
        (self.test_dir / 'screenshot').mkdir()  # deliberately no urls.jsonl

    def tearDown(self):
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_collect_urls_from_plugins(self):
        """All urls.jsonl entries are collected and tagged with their plugin name."""
        from archivebox.hooks import collect_urls_from_plugins
        urls = collect_urls_from_plugins(self.test_dir)
        self.assertEqual(len(urls), 4)
        seen_plugins = {entry['plugin'] for entry in urls}
        self.assertIn('wget', seen_plugins)
        self.assertIn('parse_html_urls', seen_plugins)
        self.assertNotIn('screenshot', seen_plugins)

    def test_collect_urls_preserves_metadata(self):
        """Extra fields like title survive collection."""
        from archivebox.hooks import collect_urls_from_plugins
        urls = collect_urls_from_plugins(self.test_dir)
        titled = [entry for entry in urls if entry.get('title') == 'HTML Link 2']
        self.assertEqual(len(titled), 1)
        self.assertEqual(titled[0]['url'], 'https://html-link-2.com')

    def test_collect_urls_empty_dir(self):
        """A missing directory yields an empty result rather than an error."""
        from archivebox.hooks import collect_urls_from_plugins
        urls = collect_urls_from_plugins(self.test_dir / 'nonexistent')
        self.assertEqual(len(urls), 0)
class TestEdgeCases(unittest.TestCase):
    """Error handling and odd inputs for the JSONL reader."""

    def test_empty_input(self):
        """No args plus a TTY stdin yields no records (and never blocks)."""
        from archivebox.misc.jsonl import read_args_or_stdin
        tty = MockTTYStringIO('', is_tty=True)
        self.assertEqual(len(list(read_args_or_stdin((), stream=tty))), 0)

    def test_malformed_jsonl(self):
        """Lines that are neither JSON nor URLs are skipped, not fatal."""
        from archivebox.misc.jsonl import read_args_or_stdin
        pipe = MockTTYStringIO(
            '{"url": "https://good.com"}\n'
            'not valid json\n'
            '{"url": "https://also-good.com"}\n',
            is_tty=False,
        )
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 2)
        self.assertEqual({r['url'] for r in recs}, {'https://good.com', 'https://also-good.com'})

    def test_mixed_input_formats(self):
        """Plain URLs, JSONL records, and bare UUIDs can be interleaved."""
        from archivebox.misc.jsonl import read_args_or_stdin
        pipe = MockTTYStringIO(
            'https://plain-url.com\n'
            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
            '01234567-89ab-cdef-0123-456789abcdef\n',
            is_tty=False,
        )
        recs = list(read_args_or_stdin((), stream=pipe))
        self.assertEqual(len(recs), 3)
        self.assertEqual(recs[0]['url'], 'https://plain-url.com')
        self.assertEqual(recs[1]['url'], 'https://jsonl-url.com')
        self.assertEqual(recs[1]['tags'], 'test')
        self.assertEqual(recs[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')

    def test_crawl_with_multiple_urls(self):
        """A Crawl record's newline-separated urls field splits cleanly."""
        from archivebox.misc.jsonl import TYPE_CRAWL
        sample = {
            'type': TYPE_CRAWL,
            'id': 'test-multi-url-crawl',
            'urls': 'https://url1.com\nhttps://url2.com\nhttps://url3.com',
            'max_depth': 0,
        }
        parsed = [u.strip() for u in sample['urls'].split('\n') if u.strip()]
        self.assertEqual(parsed, ['https://url1.com', 'https://url2.com', 'https://url3.com'])
# =============================================================================
# Pass-Through Behavior Tests
# =============================================================================
class TestPassThroughBehavior(unittest.TestCase):
"""Test pass-through behavior in CLI commands."""
def test_crawl_passes_through_other_types(self):
"""crawl create should pass through records with other types."""
# Input: a Tag record (not a Crawl or URL)
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
url_record = {'url': 'https://example.com'}
# Mock stdin with both records
stdin = MockTTYStringIO(
json.dumps(tag_record)
+ '\n'
+ json.dumps(url_record),
is_tty=False,
)
# The Tag should be passed through, the URL should create a Crawl
# (This is a unit test of the pass-through logic)
from archivebox.misc.jsonl import read_args_or_stdin
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 2)
# First record is a Tag (other type)
self.assertEqual(records[0]['type'], 'Tag')
# Second record has a URL
self.assertIn('url', records[1])
def test_snapshot_passes_through_crawl(self):
"""snapshot create should pass through Crawl records."""
from archivebox.misc.jsonl import TYPE_CRAWL
crawl_record = {
'type': TYPE_CRAWL,
'id': 'test-crawl',
'urls': 'https://example.com',
}
# Crawl records should be passed through AND create snapshots
# This tests the accumulation behavior
self.assertEqual(crawl_record['type'], TYPE_CRAWL)
self.assertIn('urls', crawl_record)
def test_archiveresult_passes_through_snapshot(self):
"""archiveresult create should pass through Snapshot records."""
from archivebox.misc.jsonl import TYPE_SNAPSHOT
snapshot_record = {
'type': TYPE_SNAPSHOT,
'id': 'test-snapshot',
'url': 'https://example.com',
}
# Snapshot records should be passed through
self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
self.assertIn('url', snapshot_record)
def test_run_passes_through_unknown_types(self):
"""run should pass through records with unknown types."""
unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
# Unknown types should be passed through unchanged
self.assertEqual(unknown_record['type'], 'Unknown')
self.assertIn('data', unknown_record)
class TestPipelineAccumulation(unittest.TestCase):
"""Test that pipelines accumulate records correctly."""
def test_full_pipeline_output_types(self):
"""Full pipeline should output all record types."""
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
# Simulated pipeline output after: crawl | snapshot | archiveresult | run
# Should contain Crawl, Snapshot, and ArchiveResult records
pipeline_output = [
{'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
{'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
{'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
]
types = {r['type'] for r in pipeline_output}
self.assertIn(TYPE_CRAWL, types)
self.assertIn(TYPE_SNAPSHOT, types)
self.assertIn(TYPE_ARCHIVERESULT, types)
def test_pipeline_preserves_ids(self):
"""Pipeline should preserve record IDs through all stages."""
records = [
{'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
{'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
]
# All records should have IDs
for record in records:
self.assertIn('id', record)
self.assertTrue(record['id'])
def test_jq_transform_pattern(self):
"""Test pattern for jq transforms in pipeline."""
# Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
failed_record = {
'type': 'ArchiveResult',
'id': 'ar1',
'status': 'failed',
'plugin': 'wget',
}
# Transform: delete id, set status to queued
transformed = {
'type': failed_record['type'],
'status': 'queued',
'plugin': failed_record['plugin'],
}
self.assertNotIn('id', transformed)
self.assertEqual(transformed['status'], 'queued')
# Allow running this module directly with the stdlib unittest runner.
if __name__ == '__main__':
    unittest.main()

View File

@@ -1,382 +0,0 @@
"""Tests for the core views, especially AddView."""
import importlib
import os
import django
from unittest.mock import patch
from typing import TypeVar, cast
from django.forms import BaseForm
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
TestCase = importlib.import_module('django.test').TestCase
Client = importlib.import_module('django.test').Client
User = importlib.import_module('django.contrib.auth.models').User
reverse = importlib.import_module('django.urls').reverse
Crawl = importlib.import_module('archivebox.crawls.models').Crawl
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
Tag = importlib.import_module('archivebox.core.models').Tag
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
T = TypeVar('T')
def require(value: 'T | None') -> 'T':
    """Unwrap an Optional test value, raising AssertionError if it is None."""
    if value is not None:
        return value
    raise AssertionError('Expected value to be present')
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)
        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = require(Crawl.objects.first())
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = require(Crawl.objects.first())
        schedule = require(CrawlSchedule.objects.first())
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = require(CrawlSchedule.objects.first())
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.first())
        plugins = crawl.config.get('PLUGINS', '')
        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.first())
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # The Django test client can't easily POST the KeyValueWidget format,
        # so this scenario would need to drive the form directly or mock
        # cleaned_data. Skip explicitly instead of silently passing so the
        # missing coverage stays visible in test reports.
        self.skipTest('KeyValueWidget POST format is not reproducible via the test client')

    def test_add_public_anonymous_custom_config_is_silently_stripped(self):
        """Anonymous users cannot override crawl config, even with PUBLIC_ADD_VIEW enabled."""
        self.client.logout()
        with patch.object(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True):
            response = self.client.post(self.add_url, {
                'url': 'https://example.com',
                'depth': '0',
                'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
            })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)

    def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
        """Authenticated non-admin users cannot override crawl config."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
        })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)

    def test_add_staff_admin_custom_config_is_allowed(self):
        """Admin users can override crawl config."""
        self.client.logout()
        User.objects.create_user(
            username='adminuser',
            password='adminpass123',
            email='admin@example.com',
            is_staff=True,
        )
        self.client.login(username='adminuser', password='adminpass123')
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'config': '{"YTDLP_ARGS_EXTRA":["--exec","echo hello"]}',
        })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.first())
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = require(Crawl.objects.first())
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -70,9 +70,16 @@ def parse_line(line: str) -> Optional[Dict[str, Any]]:
if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
return {'type': TYPE_SNAPSHOT, 'url': line}
# Could be a snapshot ID (UUID)
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
if len(line) == 36 and line.count('-') == 4:
return {'type': TYPE_SNAPSHOT, 'id': line}
if len(line) == 32:
try:
int(line, 16)
except ValueError:
pass
else:
return {'type': TYPE_SNAPSHOT, 'id': line}
# Unknown format, skip
return None

View File

@@ -607,7 +607,7 @@ def log_worker_event(
# Build final message
error_str = f' {type(error).__name__}: {error}' if error else ''
from archivebox.misc.logging import CONSOLE
from archivebox.misc.logging import CONSOLE, STDERR
from rich.text import Text
# Create a Rich Text object for proper formatting
@@ -632,7 +632,11 @@ def log_worker_event(
if metadata_str:
text.append(f' | {metadata_str}')
CONSOLE.print(text, soft_wrap=True)
# Stdout is reserved for JSONL records whenever commands are piped together.
# Route worker/DB progress to stderr in non-TTY contexts so pipelines like
# `archivebox snapshot list | archivebox run` keep stdout machine-readable.
output_console = CONSOLE if sys.stdout.isatty() else STDERR
output_console.print(text, soft_wrap=True)
@enforce_types

View File

@@ -1,2 +0,0 @@
# Create your tests here.

View File

@@ -3,8 +3,10 @@
import os
import sys
import subprocess
import tempfile
import textwrap
import time
import shutil
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
@@ -14,6 +16,9 @@ from archivebox.uuid_compat import uuid7
pytest_plugins = ["archivebox.tests.fixtures"]
SESSION_DATA_DIR = Path(tempfile.mkdtemp(prefix="archivebox-pytest-session-")).resolve()
os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR))
# =============================================================================
# CLI Helpers (defined before fixtures that use them)
@@ -82,6 +87,36 @@ def run_archivebox_cmd(
# Fixtures
# =============================================================================
@pytest.fixture(autouse=True)
def isolate_test_runtime(tmp_path):
"""
Run each pytest test from an isolated temp cwd and restore env mutations.
The maintained pytest suite lives under ``archivebox/tests``. Many of those
CLI tests shell out without passing ``cwd=`` explicitly, so the safest
contract is that every test starts in its own temp directory and any
in-process ``os.environ`` edits are rolled back afterwards.
We intentionally clear ``DATA_DIR`` for the body of each test so subprocess
tests that rely on cwd keep working. During collection/import time we still
seed a separate session-scoped temp ``DATA_DIR`` above so any ArchiveBox
config imported before this fixture runs never points at the repo root.
"""
original_cwd = Path.cwd()
original_env = os.environ.copy()
os.chdir(tmp_path)
os.environ.pop("DATA_DIR", None)
try:
yield
finally:
os.chdir(original_cwd)
os.environ.clear()
os.environ.update(original_env)
def pytest_sessionfinish(session, exitstatus):
shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
@pytest.fixture
def isolated_data_dir(tmp_path):
"""

View File

@@ -7,8 +7,11 @@ import pytest
@pytest.fixture
def process(tmp_path):
os.chdir(tmp_path)
process = subprocess.run(['archivebox', 'init'], capture_output=True)
process = subprocess.run(
['archivebox', 'init'],
capture_output=True,
cwd=tmp_path,
)
return process
@pytest.fixture

View File

@@ -1,17 +1,12 @@
import importlib
from io import StringIO
from archivebox.config.django import setup_django
from django.contrib.auth import get_user_model
from django.test import RequestFactory, TestCase
setup_django()
from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
from archivebox.crawls.models import CrawlSchedule
User = importlib.import_module('django.contrib.auth.models').User
TestCase = importlib.import_module('django.test').TestCase
RequestFactory = importlib.import_module('django.test').RequestFactory
api_v1_cli = importlib.import_module('archivebox.api.v1_cli')
ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema
cli_schedule = api_v1_cli.cli_schedule
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
User = get_user_model()
class CLIScheduleAPITests(TestCase):

View File

@@ -1,13 +1,10 @@
#!/usr/bin/env python3
"""Integration tests for archivebox extract command."""
"""Tests for archivebox extract input handling and pipelines."""
import os
import subprocess
import sqlite3
import json
import pytest
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -271,7 +268,3 @@ class TestExtractCLI:
# Should show warning about no snapshots or exit normally (empty input)
assert result.returncode == 0 or 'No' in result.stderr
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,377 @@
"""
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
This file covers both:
- low-level JSONL/stdin parsing behavior that makes CLI piping work
- subprocess integration for the supported records `archivebox run` consumes
"""
import sqlite3
import sys
import uuid
from io import StringIO
from pathlib import Path
from archivebox.tests.conftest import (
create_test_url,
parse_jsonl_output,
run_archivebox_cmd,
)
# Minimal extractor config for pipe tests: run only the cheap favicon plugin
# and disable TTY-oriented output so stdout stays machine-readable JSONL.
PIPE_TEST_ENV = {
    "PLUGINS": "favicon",
    "SAVE_FAVICON": "True",
    "USE_COLOR": "False",
    "SHOW_PROGRESS": "False",
}
class MockTTYStringIO(StringIO):
    """In-memory stream whose isatty() answer is chosen by the caller.

    Lets the stdin-parsing helpers be exercised in both "interactive
    terminal" and "piped input" modes without needing a real TTY.
    """

    def __init__(self, initial_value: str = "", *, is_tty: bool):
        super().__init__(initial_value)
        self._is_tty = is_tty

    def isatty(self) -> bool:
        # Report the configured flag instead of probing a file descriptor.
        return self._is_tty
def _stdout_lines(stdout: str) -> list[str]:
return [line for line in stdout.splitlines() if line.strip()]
def _assert_stdout_is_jsonl_only(stdout: str) -> None:
    """Fail unless stdout is non-empty and every non-blank line looks like JSON."""
    lines = _stdout_lines(stdout)
    assert lines, "Expected stdout to contain JSONL records"
    for line in lines:
        # Any line not starting with '{' means human-readable noise leaked
        # onto stdout; include the full capture in the failure message.
        assert line.lstrip().startswith("{"), stdout
def _sqlite_param(value: object) -> object:
if not isinstance(value, str):
return value
try:
return uuid.UUID(value).hex
except ValueError:
return value
def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
    """Run *sql* against the archive's index db; return col 0 of the first row, or None."""
    normalized = tuple(_sqlite_param(param) for param in params)
    conn = sqlite3.connect(data_dir / "index.sqlite3")
    try:
        row = conn.execute(sql, normalized).fetchone()
    finally:
        conn.close()
    return None if not row else row[0]
def test_parse_line_accepts_supported_piping_inputs():
    """The JSONL parser should normalize the input forms CLI pipes accept."""
    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line
    # Blank lines, comments, bare words, and unsupported URL schemes are dropped.
    assert parse_line("") is None
    assert parse_line(" ") is None
    assert parse_line("# comment") is None
    assert parse_line("not-a-url") is None
    assert parse_line("ftp://example.com") is None
    # Plain http(s)/file URLs become Snapshot records.
    plain_url = parse_line("https://example.com")
    assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}
    file_url = parse_line("file:///tmp/example.txt")
    assert file_url == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}
    # Structured JSONL records keep their extra fields (tags, max_depth, ...).
    snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
    assert snapshot_json is not None
    assert snapshot_json["type"] == TYPE_SNAPSHOT
    assert snapshot_json["tags"] == "tag1,tag2"
    crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
    assert crawl_json is not None
    assert crawl_json["type"] == TYPE_CRAWL
    assert crawl_json["id"] == "abc123"
    assert crawl_json["max_depth"] == 1
    # Bare snapshot IDs are accepted in both dashed-UUID and compact hex form.
    snapshot_id = "01234567-89ab-cdef-0123-456789abcdef"
    parsed_id = parse_line(snapshot_id)
    assert parsed_id == {"type": TYPE_SNAPSHOT, "id": snapshot_id}
    compact_snapshot_id = "0123456789abcdef0123456789abcdef"
    compact_parsed_id = parse_line(compact_snapshot_id)
    assert compact_parsed_id == {"type": TYPE_SNAPSHOT, "id": compact_snapshot_id}
def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
    """Piping helpers should consume args, structured JSONL, and pass-through records."""
    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin
    # CLI positional args are read directly, preserving order.
    records = list(read_args_or_stdin(("https://example1.com", "https://example2.com")))
    assert [record["url"] for record in records] == ["https://example1.com", "https://example2.com"]
    # Mixed stdin: plain URL, Snapshot JSONL, an unknown record type (passed
    # through untouched), a bare UUID, and one invalid line (dropped).
    stdin_records = list(
        read_args_or_stdin(
            (),
            stream=MockTTYStringIO(
                'https://plain-url.com\n'
                '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
                '{"type":"Tag","id":"tag-1","name":"example"}\n'
                '01234567-89ab-cdef-0123-456789abcdef\n'
                'not valid json\n',
                is_tty=False,
            ),
        )
    )
    assert len(stdin_records) == 4
    assert stdin_records[0]["url"] == "https://plain-url.com"
    assert stdin_records[1]["url"] == "https://jsonl-url.com"
    assert stdin_records[1]["tags"] == "test"
    assert stdin_records[2]["type"] == "Tag"
    assert stdin_records[2]["name"] == "example"
    assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef"
    # Crawl records with embedded newline-separated URL lists survive intact.
    crawl_records = list(
        read_args_or_stdin(
            (),
            stream=MockTTYStringIO(
                '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
                is_tty=False,
            ),
        )
    )
    assert len(crawl_records) == 1
    assert crawl_records[0]["type"] == TYPE_CRAWL
    assert crawl_records[0]["id"] == "crawl-1"
    # An interactive TTY stdin must never be consumed as piped input.
    tty_records = list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True)))
    assert tty_records == []
def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
    """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping."""
    from archivebox.hooks import collect_urls_from_plugins
    (tmp_path / "wget").mkdir()
    (tmp_path / "wget" / "urls.jsonl").write_text(
        '{"url":"https://wget-link-1.com"}\n'
        '{"url":"https://wget-link-2.com"}\n',
        encoding="utf-8",
    )
    (tmp_path / "parse_html_urls").mkdir()
    (tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
        '{"url":"https://html-link-1.com"}\n'
        '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
        encoding="utf-8",
    )
    # A plugin dir without urls.jsonl must contribute nothing.
    (tmp_path / "screenshot").mkdir()
    urls = collect_urls_from_plugins(tmp_path)
    assert len(urls) == 4
    # Each discovered URL is tagged with the plugin dir it came from.
    assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"}
    titled = [url for url in urls if url.get("title") == "HTML Link 2"]
    assert len(titled) == 1
    assert titled[0]["url"] == "https://html-link-2.com"
    # Missing snapshot dirs are handled gracefully (empty result, no raise).
    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
    url = create_test_url()
    create_stdout, create_stderr, create_code = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert create_code == 0, create_stderr
    # stdout must stay pure JSONL so it can be piped into the next command.
    _assert_stdout_is_jsonl_only(create_stdout)
    crawl = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Crawl")
    # Feed the create output verbatim into `run`, as a shell pipe would.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=create_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    # `run` must echo back the crawl record it consumed.
    assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records)
    # The crawl's starting URL(s) should have materialized as snapshot rows.
    snapshot_count = _db_value(
        initialized_archive,
        "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?",
        (crawl["id"],),
    )
    assert isinstance(snapshot_count, int)
    assert snapshot_count >= 1
def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
    """`archivebox snapshot list | archivebox run` should requeue listed snapshots."""
    url = create_test_url()
    create_stdout, create_stderr, create_code = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert create_code == 0, create_stderr
    snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot")
    list_stdout, list_stderr, list_code = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"],
        data_dir=initialized_archive,
    )
    # Best-effort filter by snapshot id; if the snapshot already left the
    # queued state (or the id-in-url trick matched nothing), fall back to
    # listing by the original URL instead.
    if list_code != 0 or not parse_jsonl_output(list_stdout):
        list_stdout, list_stderr, list_code = run_archivebox_cmd(
            ["snapshot", "list", f"--url__icontains={url}"],
            data_dir=initialized_archive,
        )
    assert list_code == 0, list_stderr
    _assert_stdout_is_jsonl_only(list_stdout)
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=list_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records)
    # After `run` finishes, the snapshot must have been fully processed.
    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
    """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
    url = create_test_url()
    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert snapshot_code == 0, snapshot_stderr
    # Create an ArchiveResult by piping the snapshot record in on stdin.
    ar_create_stdout, ar_create_stderr, ar_create_code = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snapshot_stdout,
        data_dir=initialized_archive,
    )
    assert ar_create_code == 0, ar_create_stderr
    created_records = parse_jsonl_output(ar_create_stdout)
    archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
    list_stdout, list_stderr, list_code = run_archivebox_cmd(
        ["archiveresult", "list", "--plugin=favicon"],
        data_dir=initialized_archive,
    )
    assert list_code == 0, list_stderr
    _assert_stdout_is_jsonl_only(list_stdout)
    orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
        ["orchestrator"],
        stdin=list_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert orchestrator_code == 0, orchestrator_stderr
    _assert_stdout_is_jsonl_only(orchestrator_stdout)
    # The legacy alias must print its rename hint on stderr only, so stdout
    # remains machine-readable JSONL.
    assert "renamed to `archivebox run`" in orchestrator_stderr
    run_records = parse_jsonl_output(orchestrator_stdout)
    assert any(
        record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
        for record in run_records
    )
def test_binary_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox binary create | archivebox run` should queue the binary record for processing."""
    # sys.executable is used as a binary that is guaranteed to exist on disk.
    create_stdout, create_stderr, create_code = run_archivebox_cmd(
        ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"],
        data_dir=initialized_archive,
    )
    assert create_code == 0, create_stderr
    _assert_stdout_is_jsonl_only(create_stdout)
    binary = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Binary")
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=create_stdout,
        data_dir=initialized_archive,
        timeout=120,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    assert any(record.get("type") == "Binary" and record.get("id") == binary["id"] for record in run_records)
    # The row may or may not have been installed by the time run exits;
    # either state proves it was queued and persisted correctly.
    status = _db_value(
        initialized_archive,
        "SELECT status FROM machine_binary WHERE id = ?",
        (binary["id"],),
    )
    assert status in {"queued", "installed"}
def test_multi_stage_pipeline_into_run(initialized_archive):
    """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work."""
    url = create_test_url()
    # Stage 1: create the crawl, emitting a Crawl JSONL record.
    crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert crawl_code == 0, crawl_stderr
    _assert_stdout_is_jsonl_only(crawl_stdout)
    # Stage 2: materialize snapshots from the piped crawl record.
    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=crawl_stdout,
        data_dir=initialized_archive,
    )
    assert snapshot_code == 0, snapshot_stderr
    _assert_stdout_is_jsonl_only(snapshot_stdout)
    # Stage 3: attach a favicon ArchiveResult to each piped snapshot.
    archiveresult_stdout, archiveresult_stderr, archiveresult_code = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snapshot_stdout,
        data_dir=initialized_archive,
    )
    assert archiveresult_code == 0, archiveresult_stderr
    _assert_stdout_is_jsonl_only(archiveresult_stdout)
    # Stage 4: process everything queued above.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=archiveresult_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    snapshot = next(record for record in run_records if record.get("type") == "Snapshot")
    assert any(record.get("type") == "ArchiveResult" for record in run_records)
    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"

View File

@@ -1,156 +0,0 @@
import json as pyjson
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and "-" in snapshot_id:
candidates.add(snapshot_id.replace("-", ""))
for needle in candidates:
for path in data_dir.rglob(needle):
if path.is_dir():
return path
return None
def _latest_snapshot_dir(data_dir: Path) -> Path:
conn = sqlite3.connect(data_dir / "index.sqlite3")
try:
snapshot_id = conn.execute(
"SELECT id FROM core_snapshot ORDER BY created_at DESC LIMIT 1"
).fetchone()
finally:
conn.close()
assert snapshot_id is not None, "Expected a snapshot to be created"
snapshot_dir = _find_snapshot_dir(data_dir, str(snapshot_id[0]))
assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id[0]}"
return snapshot_dir
def _latest_plugin_result(data_dir: Path, plugin: str) -> tuple[str, str, dict]:
conn = sqlite3.connect(data_dir / "index.sqlite3")
try:
row = conn.execute(
"SELECT snapshot_id, status, output_files FROM core_archiveresult "
"WHERE plugin = ? ORDER BY created_at DESC LIMIT 1",
(plugin,),
).fetchone()
finally:
conn.close()
assert row is not None, f"Expected an ArchiveResult row for plugin={plugin}"
output_files = row[2]
if isinstance(output_files, str):
output_files = pyjson.loads(output_files or "{}")
output_files = output_files or {}
return str(row[0]), str(row[1]), output_files
def _plugin_output_paths(data_dir: Path, plugin: str) -> list[Path]:
snapshot_id, status, output_files = _latest_plugin_result(data_dir, plugin)
assert status == "succeeded", f"Expected {plugin} ArchiveResult to succeed, got {status}"
assert output_files, f"Expected {plugin} ArchiveResult to record output_files"
snapshot_dir = _find_snapshot_dir(data_dir, snapshot_id)
assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
plugin_dir = snapshot_dir / plugin
output_paths = [plugin_dir / rel_path for rel_path in output_files.keys()]
missing_paths = [path for path in output_paths if not path.exists()]
assert not missing_paths, f"Expected plugin outputs to exist on disk, missing: {missing_paths}"
return output_paths
def _archivebox_env(base_env: dict, data_dir: Path) -> dict:
env = base_env.copy()
tmp_dir = Path("/tmp") / f"abx-{data_dir.name}"
tmp_dir.mkdir(parents=True, exist_ok=True)
env["TMP_DIR"] = str(tmp_dir)
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
return env
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
data_dir = Path.cwd()
env = _archivebox_env(disable_extractors_dict, data_dir)
env.update({"SAVE_SINGLEFILE": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=singlefile', 'https://example.com'],
capture_output=True,
text=True,
env=env,
timeout=900,
)
assert add_process.returncode == 0, add_process.stderr
output_files = _plugin_output_paths(data_dir, "singlefile")
assert any(path.suffix in (".html", ".htm") for path in output_files)
def test_readability_works(tmp_path, process, disable_extractors_dict):
data_dir = Path.cwd()
env = _archivebox_env(disable_extractors_dict, data_dir)
env.update({"SAVE_SINGLEFILE": "true", "SAVE_READABILITY": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=singlefile,readability', 'https://example.com'],
capture_output=True,
text=True,
env=env,
timeout=900,
)
assert add_process.returncode == 0, add_process.stderr
output_files = _plugin_output_paths(data_dir, "readability")
assert any(path.suffix in (".html", ".htm") for path in output_files)
def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
data_dir = Path.cwd()
env = _archivebox_env(disable_extractors_dict, data_dir)
env.update({"SAVE_WGET": "true", "SAVE_HTMLTOTEXT": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=wget,htmltotext', 'https://example.com'],
capture_output=True,
text=True,
env=env,
timeout=900,
)
assert add_process.returncode == 0, add_process.stderr
output_files = _plugin_output_paths(data_dir, "htmltotext")
assert any(path.suffix == ".txt" for path in output_files)
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
env = _archivebox_env(disable_extractors_dict, Path.cwd())
env.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
add_process = subprocess.run(['archivebox', 'add', '--plugins=readability,dom,singlefile', 'https://example.com'],
capture_output=True, env=env)
output_str = add_process.stdout.decode("utf-8")
assert "> singlefile" not in output_str
assert "> readability" not in output_str
def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
data_dir = Path.cwd()
env = _archivebox_env(disable_extractors_dict, data_dir)
env.update({"SAVE_HEADERS": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=headers', 'https://example.com'],
capture_output=True,
text=True,
env=env,
timeout=900,
)
assert add_process.returncode == 0, add_process.stderr
output_files = _plugin_output_paths(data_dir, "headers")
output_file = next((path for path in output_files if path.suffix == ".json"), None)
assert output_file is not None, f"Expected headers output_files to include a JSON file, got: {output_files}"
with open(output_file, 'r', encoding='utf-8') as f:
headers = pyjson.load(f)
response_headers = headers.get("response_headers") or headers.get("headers") or {}
assert isinstance(response_headers, dict), f"Expected response_headers dict, got: {response_headers!r}"
assert 'Content-Type' in response_headers or 'content-type' in response_headers

View File

@@ -13,7 +13,6 @@ ADMIN_HOST = 'admin.archivebox.localhost:8000'
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
project_root = Path(__file__).resolve().parents[2]
script = textwrap.dedent(
f"""
import os
@@ -81,7 +80,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
return subprocess.run(
[sys.executable, '-c', script],
cwd=project_root,
cwd=initialized_archive,
env=env,
text=True,
capture_output=True,
@@ -90,7 +89,6 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
project_root = Path(__file__).resolve().parents[2]
script = textwrap.dedent(
f"""
import os
@@ -137,7 +135,7 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
return subprocess.run(
[sys.executable, '-c', script],
cwd=project_root,
cwd=initialized_archive,
env=env,
text=True,
capture_output=True,
@@ -146,7 +144,6 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
project_root = Path(__file__).resolve().parents[2]
script = textwrap.dedent(
f"""
import os
@@ -199,7 +196,7 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
return subprocess.run(
[sys.executable, '-c', script],
cwd=project_root,
cwd=initialized_archive,
env=env,
text=True,
capture_output=True,

View File

@@ -29,6 +29,7 @@ Usage:
__package__ = 'archivebox.workers'
import os
import sys
import time
from typing import Type
from datetime import datetime, timedelta
@@ -258,9 +259,7 @@ class Orchestrator:
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
@@ -281,17 +280,6 @@ class Orchestrator:
# 4. Parent is this orchestrator
# 5. Started recently (within last 10 seconds)
# Debug: Check all processes with this PID first
if elapsed < 0.5:
all_procs = list(Process.objects.filter(pid=pid))
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
for p in all_procs:
print(
f'[yellow] -> type={p.process_type} status={p.status} '
f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
)
worker_process = Process.objects.filter(
pid=pid,
process_type=Process.TypeChoices.WORKER,
@@ -302,7 +290,6 @@ class Orchestrator:
if worker_process:
# Worker successfully registered!
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
return pid
time.sleep(poll_interval)
@@ -653,14 +640,15 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
from rich.live import Live
from archivebox.misc.logging import IS_TTY
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
import sys
import os
is_tty = sys.stdout.isatty()
# Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
plain_output = not IS_TTY
show_progress = is_tty and self.exit_on_idle
# When stdout is not a TTY, it may be reserved for JSONL pipeline output.
# Keep the plain progress view, but emit it to stderr instead of stdout.
plain_output = not is_tty
self.on_startup()
if not show_progress:
@@ -1241,7 +1229,7 @@ class Orchestrator:
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
for panel, line in new_lines:
if line:
print(f"[{ts}] [{panel}] {line}")
print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
last_plain_lines = set(plain_lines)
# Track idle state
@@ -1271,7 +1259,7 @@ class Orchestrator:
except KeyboardInterrupt:
if progress_layout:
progress_layout.log_event("Interrupted by user", style="red")
print() # Newline after ^C
print(file=sys.stderr) # Newline after ^C
self.on_shutdown(error=KeyboardInterrupt())
except BaseException as e:
if progress_layout:
@@ -1310,7 +1298,7 @@ class Orchestrator:
Used by commands like 'add' to ensure orchestrator is running.
"""
if cls.is_running():
print('[grey53]👨‍✈️ Orchestrator already running[/grey53]')
print('[grey53]👨‍✈️ Orchestrator already running[/grey53]', file=sys.stderr)
# Return a placeholder - actual orchestrator is in another process
return cls(exit_on_idle=exit_on_idle)