mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
cleanup archivebox tests
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -38,6 +38,7 @@ lib/
|
||||
tmp/
|
||||
data/
|
||||
data*/
|
||||
archivebox/tests/data/
|
||||
archive/
|
||||
output/
|
||||
logs/
|
||||
|
||||
@@ -107,7 +107,10 @@ class ArchiveBoxGroup(click.Group):
|
||||
# handle renamed commands
|
||||
if cmd_name in self.renamed_commands:
|
||||
new_name = self.renamed_commands[cmd_name]
|
||||
print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
|
||||
print(
|
||||
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
|
||||
file=sys.stderr,
|
||||
)
|
||||
cmd_name = new_name
|
||||
ctx.invoked_subcommand = cmd_name
|
||||
|
||||
|
||||
@@ -63,11 +63,28 @@ def create_binary(
|
||||
return 1
|
||||
|
||||
try:
|
||||
binary, created = Binary.objects.get_or_create(
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
created = not Binary.objects.filter(
|
||||
machine=machine,
|
||||
name=name,
|
||||
abspath=abspath,
|
||||
defaults={'version': version}
|
||||
)
|
||||
version=version,
|
||||
).exists()
|
||||
|
||||
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
|
||||
# records are owned by the current machine and can be safely piped into
|
||||
# `archivebox run` without creating invalid rows missing machine_id.
|
||||
binary = Binary.from_json({
|
||||
'name': name,
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'binproviders': 'env',
|
||||
'binprovider': 'env',
|
||||
})
|
||||
if binary is None:
|
||||
raise ValueError('failed to create binary record')
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
@@ -81,6 +81,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
records: list[dict] | None = None,
|
||||
plugins: str = '',
|
||||
wait: bool = True,
|
||||
) -> int:
|
||||
@@ -108,8 +109,12 @@ def run_plugins(
|
||||
# Parse comma-separated plugins list once (reused in creation and filtering)
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
# Parse stdin/args exactly once per CLI invocation.
|
||||
# `main()` may already have consumed stdin to distinguish Snapshot input from
|
||||
# ArchiveResult IDs; if so, it must pass the parsed records through here
|
||||
# instead of asking this helper to reread an already-drained pipe.
|
||||
if records is None:
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
@@ -269,7 +274,7 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: run plugins on Snapshots from input
|
||||
sys.exit(run_plugins(args, plugins=plugins, wait=wait))
|
||||
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -1,231 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import unittest
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
|
||||
'DATA_DIR': 'data.tests',
|
||||
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
|
||||
'USE_CURL': 'False',
|
||||
'USE_WGET': 'False',
|
||||
'USE_GIT': 'False',
|
||||
'USE_CHROME': 'False',
|
||||
'USE_YOUTUBEDL': 'False',
|
||||
}
|
||||
|
||||
DATA_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
init = importlib.import_module('archivebox.main').init
|
||||
SQL_INDEX_FILENAME = CONSTANTS.SQL_INDEX_FILENAME
|
||||
JSON_INDEX_FILENAME = CONSTANTS.JSON_INDEX_FILENAME
|
||||
HTML_INDEX_FILENAME = CONSTANTS.HTML_INDEX_FILENAME
|
||||
archivebox_init = importlib.import_module('archivebox.cli.archivebox_init')
|
||||
archivebox_add = importlib.import_module('archivebox.cli.archivebox_add')
|
||||
archivebox_remove = importlib.import_module('archivebox.cli.archivebox_remove')
|
||||
parse_json_main_index = importlib.import_module('archivebox.misc.legacy').parse_json_main_index
|
||||
|
||||
HIDE_CLI_OUTPUT = True
|
||||
|
||||
test_urls = '''
|
||||
https://example1.com/what/is/happening.html?what=1#how-about-this=1
|
||||
https://example2.com/what/is/happening/?what=1#how-about-this=1
|
||||
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
|
||||
https://example4.com/what/is/happening.html
|
||||
https://example5.com/
|
||||
https://example6.com
|
||||
|
||||
<test>http://example7.com</test>
|
||||
[https://example8.com/what/is/this.php?what=1]
|
||||
[and http://example9.com?what=1&other=3#and-thing=2]
|
||||
<what>https://example10.com#and-thing=2 "</about>
|
||||
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
|
||||
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
|
||||
example13.bada
|
||||
and example14.badb
|
||||
<or>htt://example15.badc</that>
|
||||
'''
|
||||
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
|
||||
|
||||
def load_main_index(*, out_dir: str):
    """Load the JSON main index stored under ``out_dir`` as a list.

    The index file is expected at ``out_dir / JSON_INDEX_FILENAME``.

    Raises:
        FileNotFoundError: if that file does not exist; the exception
            carries the path that was checked.
    """
    base_dir = Path(out_dir)
    expected_index = base_dir / JSON_INDEX_FILENAME
    if expected_index.exists():
        # Materialise whatever parse_json_main_index yields so callers can len() it.
        return list(parse_json_main_index(base_dir))
    raise FileNotFoundError(expected_index)
|
||||
|
||||
|
||||
@contextmanager
def output_hidden(show_failing=True):
    """Hide everything written to sys.stdout / sys.stderr inside the with-block.

    When ``HIDE_CLI_OUTPUT`` is false this is a no-op. Otherwise both streams
    are redirected into ``stdout.txt`` / ``stderr.txt`` in the current working
    directory for the duration of the block.

    Args:
        show_failing: when true and the block raises an ``Exception``, the
            captured output is printed to the restored stdout before the
            exception propagates.

    The streams that were active on entry are always restored and the capture
    files are always closed and removed, whatever leaves the block. This
    includes ``SystemExit`` and ``KeyboardInterrupt``, which are not
    ``Exception`` subclasses and would otherwise leave sys.stdout pointing at
    a deleted file.
    """
    if not HIDE_CLI_OUTPUT:
        yield
        return

    capture_paths = ('stdout.txt', 'stderr.txt')
    # Restore whatever was installed on entry so nested use also unwinds correctly.
    previous_stdout, previous_stderr = sys.stdout, sys.stderr

    captured_stdout = open(capture_paths[0], 'w+', encoding='utf-8')
    try:
        captured_stderr = open(capture_paths[1], 'w+', encoding='utf-8')
    except BaseException:
        # Do not leak the first capture file if the second cannot be opened.
        captured_stdout.close()
        os.remove(capture_paths[0])
        raise

    sys.stdout, sys.stderr = captured_stdout, captured_stderr
    failed = False
    try:
        yield
    except Exception:
        # Only ordinary failures are replayed; SystemExit etc. stay silent.
        failed = True
        raise
    finally:
        sys.stdout, sys.stderr = previous_stdout, previous_stderr
        captured_stdout.close()
        captured_stderr.close()
        try:
            if failed and show_failing:
                for capture_path in capture_paths:
                    with open(capture_path, 'r', encoding='utf-8') as f:
                        print(f.read())
        finally:
            for capture_path in capture_paths:
                os.remove(capture_path)
|
||||
|
||||
|
||||
class TestInit(unittest.TestCase):
    """Exercise `archivebox init` against a throwaway DATA_DIR."""

    def setUp(self):
        """Create the data directory the tests run against."""
        os.makedirs(DATA_DIR, exist_ok=True)

    def tearDown(self):
        """Remove the data directory so tests do not see each other's state."""
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def _index_paths(self):
        """Return the SQL, JSON and HTML index paths inside DATA_DIR."""
        return [
            Path(DATA_DIR) / index_filename
            for index_filename in (SQL_INDEX_FILENAME, JSON_INDEX_FILENAME, HTML_INDEX_FILENAME)
        ]

    def test_basic_init(self):
        """init on an empty directory creates all index files and no links."""
        with output_hidden():
            archivebox_init.main([])

        for index_path in self._index_paths():
            self.assertTrue(index_path.exists(), f'{index_path} was not created')
        self.assertEqual(len(load_main_index(out_dir=DATA_DIR)), 0)

    def test_conflicting_init(self):
        """init exits without creating indexes when the directory holds other files."""
        with open(Path(DATA_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
            f.write('test')

        with self.assertRaises(SystemExit, msg='Init should have exited with an exception'):
            with output_hidden(show_failing=False):
                archivebox_init.main([])

        for index_path in self._index_paths():
            self.assertFalse(index_path.exists(), f'{index_path} should not exist')

        # The previous try/assert False/except Exception form swallowed its own
        # AssertionError, so this expectation could never fail.
        with self.assertRaises(
            FileNotFoundError,
            msg='load_main_index should raise an exception when no index is present',
        ):
            load_main_index(out_dir=DATA_DIR)

    def test_no_dirty_state(self):
        """init can run again after the data directory has been wiped."""
        with output_hidden():
            init()
        shutil.rmtree(DATA_DIR, ignore_errors=True)
        with output_hidden():
            init()
|
||||
|
||||
|
||||
class TestAdd(unittest.TestCase):
    """Exercise `archivebox add` with URL, file and stdin inputs."""

    def setUp(self):
        """Create and initialise a fresh data directory."""
        os.makedirs(DATA_DIR, exist_ok=True)
        with output_hidden():
            init()

    def tearDown(self):
        """Remove the data directory, including any files the tests wrote into it."""
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def test_add_arg_url(self):
        """Adding a feed URL passed as an argument indexes the expected link count."""
        # NOTE(review): this passes a live remote feed URL and expects exactly
        # 30 links, so the result presumably depends on remote content - confirm.
        with output_hidden():
            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])

        all_links = load_main_index(out_dir=DATA_DIR)
        self.assertEqual(len(all_links), 30)

    def test_add_arg_file(self):
        """Adding a file of URLs passed as an argument indexes 12 links."""
        test_file = Path(DATA_DIR) / 'test.txt'
        # No explicit removal afterwards: tearDown deletes the whole DATA_DIR
        # tree, which also covers the case where an assertion below fails.
        test_file.write_text(test_urls, encoding='utf-8')

        with output_hidden():
            archivebox_add.main([test_file])

        all_links = load_main_index(out_dir=DATA_DIR)
        self.assertEqual(len(all_links), 12)

    def test_add_stdin_url(self):
        """Adding the same URLs via stdin indexes 12 links."""
        with output_hidden():
            archivebox_add.main([], stdin=test_urls)

        all_links = load_main_index(out_dir=DATA_DIR)
        self.assertEqual(len(all_links), 12)
|
||||
|
||||
|
||||
class TestRemove(unittest.TestCase):
    """Exercise `archivebox remove` against an index seeded from ``test_urls``."""

    def setUp(self):
        """Initialise a data directory and add the shared test URLs via stdin."""
        os.makedirs(DATA_DIR, exist_ok=True)
        with output_hidden():
            init()
            archivebox_add.main([], stdin=test_urls)

    def tearDown(self):
        """Remove the data directory so each test starts from a freshly seeded index."""
        # This cleanup used to be commented out, which left DATA_DIR behind
        # after the run and let earlier removals carry over into later tests.
        shutil.rmtree(DATA_DIR, ignore_errors=True)

    def _remaining_links(self):
        """Return the number of links currently in the main index."""
        return len(load_main_index(out_dir=DATA_DIR))

    def test_remove_exact(self):
        """Removing one exact URL leaves 11 links."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])

        self.assertEqual(self._remaining_links(), 11)

    def test_remove_regex(self):
        """Removing by regex leaves 4 links."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])

        self.assertEqual(self._remaining_links(), 4)

    def test_remove_domain(self):
        """Removing two domains leaves 10 links."""
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])

        self.assertEqual(self._remaining_links(), 10)

    def test_remove_none(self):
        """Removing a URL that matches nothing must raise."""
        # The previous try/assert False/except Exception form swallowed its own
        # AssertionError, so the test passed even when nothing was raised.
        with self.assertRaises(Exception, msg='Should raise if no URLs match'):
            with output_hidden(show_failing=False):
                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Let captured CLI output through when the runner is asked to be verbose.
    if any(flag in sys.argv for flag in ('--verbose', '-v')):
        HIDE_CLI_OUTPUT = False

    unittest.main()
|
||||
@@ -1,665 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for CLI piping workflow: crawl | snapshot | archiveresult | run
|
||||
|
||||
This module tests the JSONL-based piping between CLI commands as described in:
|
||||
https://github.com/ArchiveBox/ArchiveBox/issues/1363
|
||||
|
||||
Workflows tested:
|
||||
archivebox crawl create URL -> Crawl JSONL
|
||||
archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input)
|
||||
archivebox archiveresult create -> ArchiveResult JSONL (accepts Snapshot input)
|
||||
archivebox run -> Process queued records (accepts any JSONL)
|
||||
|
||||
Pipeline:
|
||||
archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run
|
||||
|
||||
Each command should:
|
||||
- Accept URLs, IDs, or JSONL as input (args or stdin)
|
||||
- Output JSONL to stdout when piped (not TTY)
|
||||
- Output human-readable to stderr when TTY
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import TypeVar
|
||||
|
||||
# Test configuration - disable slow extractors
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'True', # Fast extractor
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'USE_CURL': 'False',
|
||||
'USE_WGET': 'False',
|
||||
'USE_GIT': 'False',
|
||||
'USE_CHROME': 'False',
|
||||
'USE_YOUTUBEDL': 'False',
|
||||
'USE_NODE': 'False',
|
||||
}
|
||||
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
def require(value: T | None) -> T:
    """Return ``value`` unchanged, or raise AssertionError when it is None.

    Lets tests narrow an optional result to its concrete type before
    indexing into it.
    """
    if value is not None:
        return value
    raise AssertionError('Expected value to be present')
|
||||
|
||||
|
||||
class MockTTYStringIO(StringIO):
    """In-memory text stream whose ``isatty()`` answer is chosen by the caller.

    Args:
        initial_value: text the stream starts out containing.
        is_tty: keyword-only; the value ``isatty()`` will report.
    """

    def __init__(self, initial_value: str = '', *, is_tty: bool):
        StringIO.__init__(self, initial_value)
        # Remembered verbatim and handed back by isatty().
        self._is_tty = is_tty

    def isatty(self) -> bool:
        """Report the TTY flag supplied at construction time."""
        reported = self._is_tty
        return reported
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# JSONL Utility Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestJSONLParsing(unittest.TestCase):
    """Checks how ``parse_line`` interprets individual input lines."""

    def _assert_fields(self, record, expected):
        """Assert that ``record[key] == value`` for each expected pair, in order."""
        for key, value in expected.items():
            self.assertEqual(record[key], value)

    def _assert_all_rejected(self, parse_line, lines):
        """Assert that ``parse_line`` returns None for every given line."""
        for candidate in lines:
            self.assertIsNone(parse_line(candidate))

    def test_parse_plain_url(self):
        """A bare URL is parsed into a Snapshot record carrying that URL."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, parse_line

        record = require(parse_line('https://example.com'))
        self._assert_fields(record, {'type': TYPE_SNAPSHOT, 'url': 'https://example.com'})

    def test_parse_jsonl_snapshot(self):
        """A Snapshot JSONL line keeps its type, url and tags fields."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, parse_line

        raw = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
        record = require(parse_line(raw))
        self._assert_fields(record, {
            'type': TYPE_SNAPSHOT,
            'url': 'https://example.com',
            'tags': 'test,demo',
        })

    def test_parse_jsonl_crawl(self):
        """A Crawl JSONL line keeps its type, id, urls and max_depth fields."""
        from archivebox.misc.jsonl import TYPE_CRAWL, parse_line

        raw = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'
        record = require(parse_line(raw))
        self._assert_fields(record, {
            'type': TYPE_CRAWL,
            'id': 'abc123',
            'urls': 'https://example.com',
            'max_depth': 1,
        })

    def test_parse_jsonl_with_id(self):
        """A JSONL line with an id field exposes both id and url."""
        from archivebox.misc.jsonl import parse_line

        raw = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
        record = require(parse_line(raw))
        self._assert_fields(record, {'id': 'abc123', 'url': 'https://example.com'})

    def test_parse_uuid_as_snapshot_id(self):
        """A bare UUID is parsed into a Snapshot record with that id."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, parse_line

        snapshot_uuid = '01234567-89ab-cdef-0123-456789abcdef'
        record = require(parse_line(snapshot_uuid))
        self._assert_fields(record, {'type': TYPE_SNAPSHOT, 'id': snapshot_uuid})

    def test_parse_empty_line(self):
        """Empty and whitespace-only lines yield None."""
        from archivebox.misc.jsonl import parse_line

        self._assert_all_rejected(parse_line, ('', ' ', '\n'))

    def test_parse_comment_line(self):
        """Lines starting with '#' (optionally indented) yield None."""
        from archivebox.misc.jsonl import parse_line

        self._assert_all_rejected(parse_line, ('# This is a comment', ' # Indented comment'))

    def test_parse_invalid_url(self):
        """Non-URL text and unsupported schemes yield None."""
        from archivebox.misc.jsonl import parse_line

        # Only http/https/file are expected to be accepted.
        self._assert_all_rejected(parse_line, ('not-a-url', 'ftp://example.com'))

    def test_parse_file_url(self):
        """A file:// URL is parsed into a Snapshot record."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, parse_line

        record = require(parse_line('file:///path/to/file.txt'))
        self._assert_fields(record, {'type': TYPE_SNAPSHOT, 'url': 'file:///path/to/file.txt'})
|
||||
|
||||
|
||||
# Note: JSONL output serialization is tested in TestPipingWorkflowIntegration
|
||||
# using real model instances, not mocks.
|
||||
|
||||
|
||||
class TestReadArgsOrStdin(unittest.TestCase):
    """Checks ``read_args_or_stdin`` with argument tuples and fake stdin streams."""

    @staticmethod
    def _piped(text):
        """Build a fake non-TTY stdin stream containing ``text``."""
        return MockTTYStringIO(text, is_tty=False)

    def test_read_from_args(self):
        """URLs given as arguments come back as records in the same order."""
        from archivebox.misc.jsonl import read_args_or_stdin

        parsed = list(read_args_or_stdin(('https://example1.com', 'https://example2.com')))

        self.assertEqual(len(parsed), 2)
        first, second = parsed[0], parsed[1]
        self.assertEqual(first['url'], 'https://example1.com')
        self.assertEqual(second['url'], 'https://example2.com')

    def test_read_from_stdin(self):
        """With no arguments, URLs are read line by line from a piped stream."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped = self._piped('https://example1.com\nhttps://example2.com\n')
        parsed = list(read_args_or_stdin((), stream=piped))

        self.assertEqual(len(parsed), 2)
        first, second = parsed[0], parsed[1]
        self.assertEqual(first['url'], 'https://example1.com')
        self.assertEqual(second['url'], 'https://example2.com')

    def test_read_jsonl_from_stdin(self):
        """A Snapshot JSONL line on the piped stream keeps its url and tags."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped = self._piped('{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n')
        parsed = list(read_args_or_stdin((), stream=piped))

        self.assertEqual(len(parsed), 1)
        only = parsed[0]
        self.assertEqual(only['url'], 'https://example.com')
        self.assertEqual(only['tags'], 'test')

    def test_read_crawl_jsonl_from_stdin(self):
        """A Crawl JSONL line on the piped stream keeps its type and id."""
        from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin

        piped = self._piped('{"type": "Crawl", "id": "abc123", "urls": "https://example.com\\nhttps://foo.com"}\n')
        parsed = list(read_args_or_stdin((), stream=piped))

        self.assertEqual(len(parsed), 1)
        only = parsed[0]
        self.assertEqual(only['type'], TYPE_CRAWL)
        self.assertEqual(only['id'], 'abc123')

    def test_skip_tty_stdin(self):
        """A stream reporting itself as a TTY is not read (reading would block)."""
        from archivebox.misc.jsonl import read_args_or_stdin

        terminal = MockTTYStringIO('https://example.com', is_tty=True)
        parsed = list(read_args_or_stdin((), stream=terminal))

        self.assertEqual(len(parsed), 0)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unit Tests for Individual Commands
|
||||
# =============================================================================
|
||||
|
||||
class TestCrawlCommand(unittest.TestCase):
    """Unit tests for archivebox crawl command."""

    def setUp(self):
        """Point DATA_DIR at a fresh temp directory, remembering the old value."""
        self._previous_data_dir = os.environ.get('DATA_DIR')
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        """Restore DATA_DIR and delete the temp directory."""
        # Without this, DATA_DIR kept pointing at the deleted temp directory
        # for every test that ran afterwards in the same process.
        if self._previous_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = self._previous_data_dir
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_crawl_accepts_url(self):
        """crawl should accept URLs as input."""
        from archivebox.misc.jsonl import read_args_or_stdin

        args = ('https://example.com',)
        records = list(read_args_or_stdin(args))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], 'https://example.com')

    def test_crawl_output_format(self):
        """crawl should output Crawl JSONL records."""
        from archivebox.misc.jsonl import TYPE_CRAWL

        # NOTE(review): this only inspects a hand-built dict, not real command
        # output, so it documents the expected shape rather than verifying it.
        crawl_output = {
            'type': TYPE_CRAWL,
            'schema_version': '0.9.0',
            'id': 'test-crawl-id',
            'urls': 'https://example.com',
            'status': 'queued',
            'max_depth': 0,
        }

        self.assertEqual(crawl_output['type'], TYPE_CRAWL)
        self.assertIn('id', crawl_output)
        self.assertIn('urls', crawl_output)
|
||||
|
||||
|
||||
class TestSnapshotCommand(unittest.TestCase):
    """Unit tests for archivebox snapshot command."""

    def setUp(self):
        """Point DATA_DIR at a fresh temp directory, remembering the old value."""
        self._previous_data_dir = os.environ.get('DATA_DIR')
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        """Restore DATA_DIR and delete the temp directory."""
        # Without this, DATA_DIR kept pointing at the deleted temp directory
        # for every test that ran afterwards in the same process.
        if self._previous_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = self._previous_data_dir
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_snapshot_accepts_url(self):
        """snapshot should accept URLs as input."""
        from archivebox.misc.jsonl import read_args_or_stdin

        args = ('https://example.com',)
        records = list(read_args_or_stdin(args))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], 'https://example.com')

    def test_snapshot_accepts_crawl_jsonl(self):
        """snapshot should accept Crawl JSONL as input."""
        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_CRAWL

        stdin = MockTTYStringIO('{"type": "Crawl", "id": "abc123", "urls": "https://example.com"}\n', is_tty=False)

        records = list(read_args_or_stdin((), stream=stdin))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_CRAWL)
        self.assertEqual(records[0]['id'], 'abc123')
        self.assertEqual(records[0]['urls'], 'https://example.com')

    def test_snapshot_accepts_jsonl_with_metadata(self):
        """snapshot should accept JSONL with tags and other metadata."""
        from archivebox.misc.jsonl import read_args_or_stdin

        stdin = MockTTYStringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n', is_tty=False)

        records = list(read_args_or_stdin((), stream=stdin))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], 'https://example.com')
        self.assertEqual(records[0]['tags'], 'tag1,tag2')
        self.assertEqual(records[0]['title'], 'Test')
|
||||
|
||||
# Note: Snapshot output format is tested in integration tests
|
||||
# (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
|
||||
# using real Snapshot instances.
|
||||
|
||||
|
||||
class TestArchiveResultCommand(unittest.TestCase):
    """Unit tests for archivebox archiveresult command."""

    def setUp(self):
        """Point DATA_DIR at a fresh temp directory, remembering the old value."""
        self._previous_data_dir = os.environ.get('DATA_DIR')
        self.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = self.test_dir

    def tearDown(self):
        """Restore DATA_DIR and delete the temp directory."""
        # Without this, DATA_DIR kept pointing at the deleted temp directory
        # for every test that ran afterwards in the same process.
        if self._previous_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = self._previous_data_dir
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_archiveresult_accepts_snapshot_id(self):
        """archiveresult should accept snapshot IDs as input."""
        from archivebox.misc.jsonl import read_args_or_stdin

        uuid = '01234567-89ab-cdef-0123-456789abcdef'
        args = (uuid,)
        records = list(read_args_or_stdin(args))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['id'], uuid)

    def test_archiveresult_accepts_jsonl_snapshot(self):
        """archiveresult should accept JSONL Snapshot records."""
        from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT

        stdin = MockTTYStringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n', is_tty=False)

        records = list(read_args_or_stdin((), stream=stdin))

        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
        self.assertEqual(records[0]['id'], 'abc123')

    def test_archiveresult_gathers_snapshot_ids(self):
        """archiveresult should gather snapshot IDs from various input formats."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT

        records = [
            {'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
            {'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
            {'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
            {'id': 'snap-4'},  # Bare id
        ]

        # NOTE(review): the gathering loop lives in this test rather than being
        # imported, so it checks this copy of the logic, not the command's.
        snapshot_ids = set()
        for record in records:
            record_type = record.get('type')

            if record_type == TYPE_SNAPSHOT:
                snapshot_id = record.get('id')
                if snapshot_id:
                    snapshot_ids.add(snapshot_id)
            elif record_type == TYPE_ARCHIVERESULT:
                snapshot_id = record.get('snapshot_id')
                if snapshot_id:
                    snapshot_ids.add(snapshot_id)
            elif 'id' in record:
                snapshot_ids.add(record['id'])

        self.assertEqual(len(snapshot_ids), 4)
        self.assertIn('snap-1', snapshot_ids)
        self.assertIn('snap-2', snapshot_ids)
        self.assertIn('snap-3', snapshot_ids)
        self.assertIn('snap-4', snapshot_ids)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# URL Collection Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestURLCollection(unittest.TestCase):
    """Checks ``collect_urls_from_plugins`` against a fake plugin output tree."""

    def setUp(self):
        """Build a temp directory with two urls.jsonl-producing plugins and one without."""
        self.test_dir = Path(tempfile.mkdtemp())

        # Plugin directory name -> contents of its urls.jsonl file.
        jsonl_by_plugin = {
            'wget': (
                '{"url": "https://wget-link-1.com"}\n'
                '{"url": "https://wget-link-2.com"}\n'
            ),
            'parse_html_urls': (
                '{"url": "https://html-link-1.com"}\n'
                '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
            ),
        }
        for plugin_name, jsonl_text in jsonl_by_plugin.items():
            plugin_dir = self.test_dir / plugin_name
            plugin_dir.mkdir()
            (plugin_dir / 'urls.jsonl').write_text(jsonl_text)

        # Deliberately has no urls.jsonl: it stands in for a non-parser plugin.
        (self.test_dir / 'screenshot').mkdir()

    def tearDown(self):
        """Delete the temp directory tree."""
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_collect_urls_from_plugins(self):
        """All four entries are collected and tagged with their plugin name."""
        from archivebox.hooks import collect_urls_from_plugins

        collected = collect_urls_from_plugins(self.test_dir)
        self.assertEqual(len(collected), 4)

        seen_plugins = {entry['plugin'] for entry in collected}
        self.assertIn('wget', seen_plugins)
        self.assertIn('parse_html_urls', seen_plugins)
        # No urls.jsonl was written for screenshot.
        self.assertNotIn('screenshot', seen_plugins)

    def test_collect_urls_preserves_metadata(self):
        """Extra fields such as title survive collection."""
        from archivebox.hooks import collect_urls_from_plugins

        collected = collect_urls_from_plugins(self.test_dir)

        with_title = []
        for entry in collected:
            if entry.get('title') == 'HTML Link 2':
                with_title.append(entry)

        self.assertEqual(len(with_title), 1)
        self.assertEqual(with_title[0]['url'], 'https://html-link-2.com')

    def test_collect_urls_empty_dir(self):
        """A directory that does not exist yields no entries."""
        from archivebox.hooks import collect_urls_from_plugins

        missing_dir = self.test_dir / 'nonexistent'
        collected = collect_urls_from_plugins(missing_dir)

        self.assertEqual(len(collected), 0)
|
||||
|
||||
|
||||
class TestEdgeCases(unittest.TestCase):
    """Edge-case and error-handling checks for record input parsing."""

    def test_empty_input(self):
        """No arguments plus an empty TTY stream yields no records."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # A TTY stream must not be read, so this cannot block.
        terminal = MockTTYStringIO('', is_tty=True)
        parsed = list(read_args_or_stdin((), stream=terminal))

        self.assertEqual(len(parsed), 0)

    def test_malformed_jsonl(self):
        """A malformed line between two valid JSONL lines is skipped."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped_text = (
            '{"url": "https://good.com"}\n'
            'not valid json\n'
            '{"url": "https://also-good.com"}\n'
        )
        piped = MockTTYStringIO(piped_text, is_tty=False)

        parsed = list(read_args_or_stdin((), stream=piped))

        self.assertEqual(len(parsed), 2)
        parsed_urls = {record['url'] for record in parsed}
        self.assertEqual(parsed_urls, {'https://good.com', 'https://also-good.com'})

    def test_mixed_input_formats(self):
        """A plain URL, a JSONL record and a bare UUID are all accepted together."""
        from archivebox.misc.jsonl import read_args_or_stdin

        piped_text = (
            'https://plain-url.com\n'
            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
            '01234567-89ab-cdef-0123-456789abcdef\n'
        )
        piped = MockTTYStringIO(piped_text, is_tty=False)

        parsed = list(read_args_or_stdin((), stream=piped))
        self.assertEqual(len(parsed), 3)

        plain_url_record = parsed[0]
        jsonl_record = parsed[1]
        uuid_record = parsed[2]

        self.assertEqual(plain_url_record['url'], 'https://plain-url.com')

        self.assertEqual(jsonl_record['url'], 'https://jsonl-url.com')
        self.assertEqual(jsonl_record['tags'], 'test')

        self.assertEqual(uuid_record['id'], '01234567-89ab-cdef-0123-456789abcdef')

    def test_crawl_with_multiple_urls(self):
        """A newline-separated ``urls`` field splits into its individual URLs."""
        from archivebox.misc.jsonl import TYPE_CRAWL

        crawl_output = {
            'type': TYPE_CRAWL,
            'id': 'test-multi-url-crawl',
            'urls': 'https://url1.com\nhttps://url2.com\nhttps://url3.com',
            'max_depth': 0,
        }

        # Split on newlines, trim each piece and drop the empty ones.
        trimmed = (piece.strip() for piece in crawl_output['urls'].split('\n'))
        urls = [piece for piece in trimmed if piece]

        self.assertEqual(len(urls), 3)
        for position, expected_url in enumerate(
            ('https://url1.com', 'https://url2.com', 'https://url3.com')
        ):
            self.assertEqual(urls[position], expected_url)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pass-Through Behavior Tests
|
||||
# =============================================================================
|
||||
|
||||
class TestPassThroughBehavior(unittest.TestCase):
|
||||
"""Test pass-through behavior in CLI commands."""
|
||||
|
||||
def test_crawl_passes_through_other_types(self):
|
||||
"""crawl create should pass through records with other types."""
|
||||
|
||||
# Input: a Tag record (not a Crawl or URL)
|
||||
tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
|
||||
url_record = {'url': 'https://example.com'}
|
||||
|
||||
# Mock stdin with both records
|
||||
stdin = MockTTYStringIO(
|
||||
json.dumps(tag_record)
|
||||
+ '\n'
|
||||
+ json.dumps(url_record),
|
||||
is_tty=False,
|
||||
)
|
||||
|
||||
# The Tag should be passed through, the URL should create a Crawl
|
||||
# (This is a unit test of the pass-through logic)
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
records = list(read_args_or_stdin((), stream=stdin))
|
||||
|
||||
self.assertEqual(len(records), 2)
|
||||
# First record is a Tag (other type)
|
||||
self.assertEqual(records[0]['type'], 'Tag')
|
||||
# Second record has a URL
|
||||
self.assertIn('url', records[1])
|
||||
|
||||
def test_snapshot_passes_through_crawl(self):
|
||||
"""snapshot create should pass through Crawl records."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL
|
||||
|
||||
crawl_record = {
|
||||
'type': TYPE_CRAWL,
|
||||
'id': 'test-crawl',
|
||||
'urls': 'https://example.com',
|
||||
}
|
||||
|
||||
# Crawl records should be passed through AND create snapshots
|
||||
# This tests the accumulation behavior
|
||||
self.assertEqual(crawl_record['type'], TYPE_CRAWL)
|
||||
self.assertIn('urls', crawl_record)
|
||||
|
||||
def test_archiveresult_passes_through_snapshot(self):
|
||||
"""archiveresult create should pass through Snapshot records."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
snapshot_record = {
|
||||
'type': TYPE_SNAPSHOT,
|
||||
'id': 'test-snapshot',
|
||||
'url': 'https://example.com',
|
||||
}
|
||||
|
||||
# Snapshot records should be passed through
|
||||
self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('url', snapshot_record)
|
||||
|
||||
def test_run_passes_through_unknown_types(self):
|
||||
"""run should pass through records with unknown types."""
|
||||
unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
|
||||
|
||||
# Unknown types should be passed through unchanged
|
||||
self.assertEqual(unknown_record['type'], 'Unknown')
|
||||
self.assertIn('data', unknown_record)
|
||||
|
||||
|
||||
class TestPipelineAccumulation(unittest.TestCase):
|
||||
"""Test that pipelines accumulate records correctly."""
|
||||
|
||||
def test_full_pipeline_output_types(self):
|
||||
"""Full pipeline should output all record types."""
|
||||
from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
|
||||
# Simulated pipeline output after: crawl | snapshot | archiveresult | run
|
||||
# Should contain Crawl, Snapshot, and ArchiveResult records
|
||||
pipeline_output = [
|
||||
{'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
|
||||
{'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
|
||||
{'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
|
||||
]
|
||||
|
||||
types = {r['type'] for r in pipeline_output}
|
||||
self.assertIn(TYPE_CRAWL, types)
|
||||
self.assertIn(TYPE_SNAPSHOT, types)
|
||||
self.assertIn(TYPE_ARCHIVERESULT, types)
|
||||
|
||||
def test_pipeline_preserves_ids(self):
|
||||
"""Pipeline should preserve record IDs through all stages."""
|
||||
records = [
|
||||
{'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
|
||||
{'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
|
||||
]
|
||||
|
||||
# All records should have IDs
|
||||
for record in records:
|
||||
self.assertIn('id', record)
|
||||
self.assertTrue(record['id'])
|
||||
|
||||
def test_jq_transform_pattern(self):
|
||||
"""Test pattern for jq transforms in pipeline."""
|
||||
# Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
|
||||
failed_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'id': 'ar1',
|
||||
'status': 'failed',
|
||||
'plugin': 'wget',
|
||||
}
|
||||
|
||||
# Transform: delete id, set status to queued
|
||||
transformed = {
|
||||
'type': failed_record['type'],
|
||||
'status': 'queued',
|
||||
'plugin': failed_record['plugin'],
|
||||
}
|
||||
|
||||
self.assertNotIn('id', transformed)
|
||||
self.assertEqual(transformed['status'], 'queued')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@@ -1,382 +0,0 @@
|
||||
"""Tests for the core views, especially AddView."""
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import django
|
||||
from unittest.mock import patch
|
||||
from typing import TypeVar, cast
|
||||
|
||||
from django.forms import BaseForm
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
django.setup()
|
||||
|
||||
TestCase = importlib.import_module('django.test').TestCase
|
||||
Client = importlib.import_module('django.test').Client
|
||||
User = importlib.import_module('django.contrib.auth.models').User
|
||||
reverse = importlib.import_module('django.urls').reverse
|
||||
Crawl = importlib.import_module('archivebox.crawls.models').Crawl
|
||||
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
|
||||
Tag = importlib.import_module('archivebox.core.models').Tag
|
||||
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
def require(value: T | None) -> T:
    """Return *value* unchanged, or raise ``AssertionError`` when it is ``None``.

    Lets tests narrow ``Optional`` query results (e.g. ``.first()``) to a
    concrete object in a single expression.
    """
    if value is not None:
        return value
    raise AssertionError('Expected value to be present')
|
||||
|
||||
|
||||
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form).

    Every test starts logged in as a regular (non-staff) user created in
    ``setUp``; tests that need anonymous or staff access log out / re-login
    themselves.
    """

    def setUp(self):
        """Create a regular user, log the test client in, and resolve the add URL."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com',
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def _post_add(self, data, **kwargs):
        """POST *data* to the add view and return the response."""
        return self.client.post(self.add_url, data, **kwargs)

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Plain form fields first, then the plugin group selectors.
        expected_fields = (
            'url', 'tag', 'depth', 'notes', 'schedule', 'persona',
            'overwrite', 'update', 'index_only',
            'chrome_plugins', 'archiving_plugins', 'parsing_plugins',
        )
        for field in expected_fields:
            self.assertContains(response, f'name="{field}"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')

        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # The datalist must exist and list both tags created above.
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        for label in ('Quick Archive', 'Full Chrome', 'Text Only', 'Select All', 'Clear All'):
            self.assertContains(response, label)

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self._post_add({
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })

        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)

        # Exactly one crawl, carrying everything that was submitted.
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = require(Crawl.objects.first())

        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)

        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self._post_add({
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })

        self.assertEqual(response.status_code, 302)

        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)

        crawl = require(Crawl.objects.first())
        schedule = require(CrawlSchedule.objects.first())

        # The crawl and its schedule must reference each other.
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })

        self.assertEqual(response.status_code, 302)

        schedule = require(CrawlSchedule.objects.first())
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })

        self.assertEqual(response.status_code, 302)

        crawl = require(Crawl.objects.first())
        plugins = crawl.config.get('PLUGINS', '')

        # Should contain the selected plugins
        for plugin in ('screenshot', 'dom', 'wget'):
            self.assertIn(plugin, plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            # subTest makes a failure report name the offending depth.
            with self.subTest(depth=depth):
                response = self._post_add({
                    'url': f'https://example{depth}.com',
                    'depth': str(depth),
                })
                self.assertEqual(response.status_code, 302)

        self.assertEqual(Crawl.objects.count(), 5)

        # Crawls were created in ascending depth order, so creation order == depth.
        for expected_depth, crawl in enumerate(Crawl.objects.order_by('created_at')):
            with self.subTest(expected_depth=expected_depth):
                self.assertEqual(crawl.max_depth, expected_depth)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })

        self.assertEqual(response.status_code, 302)

        config = require(Crawl.objects.first()).config

        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides (not implemented yet)."""
        # The Django test client can't easily POST the KeyValueWidget format, so
        # this would need to use the form directly or mock the cleaned_data.
        # Report it as skipped instead of letting an empty body count as a pass.
        self.skipTest('TODO: KeyValueWidget POST format is not reproducible with the test client')

    def test_add_public_anonymous_custom_config_is_silently_stripped(self):
        """Anonymous users cannot override crawl config, even with PUBLIC_ADD_VIEW enabled."""
        self.client.logout()

        with patch.object(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True):
            response = self._post_add({
                'url': 'https://example.com',
                'depth': '0',
                'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
            })

        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)

    def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
        """Authenticated non-admin users cannot override crawl config."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
        })

        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)

    def test_add_staff_admin_custom_config_is_allowed(self):
        """Admin users can override crawl config."""
        # Swap the regular user from setUp for a staff account.
        self.client.logout()
        User.objects.create_user(
            username='adminuser',
            password='adminpass123',
            email='admin@example.com',
            is_staff=True,
        )
        self.client.login(username='adminuser', password='adminpass123')

        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'config': '{"YTDLP_ARGS_EXTRA":["--exec","echo hello"]}',
        })

        self.assertEqual(response.status_code, 302)
        crawl = require(Crawl.objects.order_by('-created_at').first())
        self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self._post_add({
            'url': '',
            'depth': '0',
        })

        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self._post_add({
            'url': 'not-a-url',
            'depth': '0',
        })

        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # NOTE(review): only checks that the word "error" appears somewhere in
        # the page; a field-level assertFormError would be stricter.
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self._post_add({
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)

        # Exactly one flash message, describing the crawl but no schedule.
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)

        # Exactly one flash message, mentioning the schedule.
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
        })

        self.assertEqual(response.status_code, 302)

        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR

        # NOTE(review): this also passes when a matching file already existed
        # before the POST; confirm the sources dir is isolated per test run.
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })

        self.assertEqual(response.status_code, 302)

        crawl = require(Crawl.objects.first())
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self._post_add({
            'url': 'https://example.com',
            'depth': '0',
        })

        crawl = require(Crawl.objects.first())
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'

        # Only the Location header is checked; the target page is not fetched.
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
|
||||
@@ -70,9 +70,16 @@ def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
||||
if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
|
||||
return {'type': TYPE_SNAPSHOT, 'url': line}
|
||||
|
||||
# Could be a snapshot ID (UUID)
|
||||
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
|
||||
if len(line) == 36 and line.count('-') == 4:
|
||||
return {'type': TYPE_SNAPSHOT, 'id': line}
|
||||
if len(line) == 32:
|
||||
try:
|
||||
int(line, 16)
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
return {'type': TYPE_SNAPSHOT, 'id': line}
|
||||
|
||||
# Unknown format, skip
|
||||
return None
|
||||
|
||||
@@ -607,7 +607,7 @@ def log_worker_event(
|
||||
|
||||
# Build final message
|
||||
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||
from archivebox.misc.logging import CONSOLE
|
||||
from archivebox.misc.logging import CONSOLE, STDERR
|
||||
from rich.text import Text
|
||||
|
||||
# Create a Rich Text object for proper formatting
|
||||
@@ -632,7 +632,11 @@ def log_worker_event(
|
||||
if metadata_str:
|
||||
text.append(f' | {metadata_str}')
|
||||
|
||||
CONSOLE.print(text, soft_wrap=True)
|
||||
# Stdout is reserved for JSONL records whenever commands are piped together.
|
||||
# Route worker/DB progress to stderr in non-TTY contexts so pipelines like
|
||||
# `archivebox snapshot list | archivebox run` keep stdout machine-readable.
|
||||
output_console = CONSOLE if sys.stdout.isatty() else STDERR
|
||||
output_console.print(text, soft_wrap=True)
|
||||
|
||||
|
||||
@enforce_types
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
|
||||
# Create your tests here.
|
||||
@@ -3,8 +3,10 @@
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import tempfile
|
||||
import textwrap
|
||||
import time
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
@@ -14,6 +16,9 @@ from archivebox.uuid_compat import uuid7
|
||||
|
||||
pytest_plugins = ["archivebox.tests.fixtures"]
|
||||
|
||||
SESSION_DATA_DIR = Path(tempfile.mkdtemp(prefix="archivebox-pytest-session-")).resolve()
|
||||
os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Helpers (defined before fixtures that use them)
|
||||
@@ -82,6 +87,36 @@ def run_archivebox_cmd(
|
||||
# Fixtures
|
||||
# =============================================================================
|
||||
|
||||
@pytest.fixture(autouse=True)
def isolate_test_runtime(tmp_path):
    """
    Give every test its own temp working directory and undo env changes.

    Many CLI tests under ``archivebox/tests`` shell out without passing
    ``cwd=``, so each test is started from ``tmp_path`` and any in-process
    ``os.environ`` edits are rolled back when the test finishes.

    ``DATA_DIR`` is removed from the environment for the body of the test so
    subprocess tests that rely on the working directory keep working. The
    session-scoped temp ``DATA_DIR`` seeded at import time only protects
    config imported before this fixture runs from pointing at the repo root.
    """
    saved_cwd = Path.cwd()
    saved_env = dict(os.environ)

    os.chdir(tmp_path)
    os.environ.pop("DATA_DIR", None)
    try:
        yield
    finally:
        # Restore cwd first, then replace the environment wholesale so keys
        # added during the test disappear as well.
        os.chdir(saved_cwd)
        os.environ.clear()
        os.environ.update(saved_env)
|
||||
|
||||
|
||||
def pytest_sessionfinish(session, exitstatus):
    """Delete the session-wide temp ``SESSION_DATA_DIR`` when the pytest session finishes."""
    # ignore_errors=True keeps cleanup best-effort: a failed removal never raises here.
    shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
|
||||
|
||||
@pytest.fixture
|
||||
def isolated_data_dir(tmp_path):
|
||||
"""
|
||||
|
||||
@@ -7,8 +7,11 @@ import pytest
|
||||
|
||||
@pytest.fixture
def process(tmp_path):
    """Run ``archivebox init`` once inside *tmp_path* and return the completed process.

    The working directory is handed to the subprocess via ``cwd=`` instead of
    ``os.chdir`` so the fixture never changes the test runner's own cwd, and
    ``init`` is invoked a single time.
    """
    return subprocess.run(
        ['archivebox', 'init'],
        capture_output=True,
        cwd=tmp_path,
    )
|
||||
|
||||
@pytest.fixture
|
||||
|
||||
@@ -1,17 +1,12 @@
|
||||
import importlib
|
||||
from io import StringIO
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.test import RequestFactory, TestCase
|
||||
|
||||
setup_django()
|
||||
from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
|
||||
from archivebox.crawls.models import CrawlSchedule
|
||||
|
||||
User = importlib.import_module('django.contrib.auth.models').User
|
||||
TestCase = importlib.import_module('django.test').TestCase
|
||||
RequestFactory = importlib.import_module('django.test').RequestFactory
|
||||
api_v1_cli = importlib.import_module('archivebox.api.v1_cli')
|
||||
ScheduleCommandSchema = api_v1_cli.ScheduleCommandSchema
|
||||
cli_schedule = api_v1_cli.cli_schedule
|
||||
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
|
||||
User = get_user_model()
|
||||
|
||||
|
||||
class CLIScheduleAPITests(TestCase):
|
||||
@@ -1,13 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox extract command."""
|
||||
"""Tests for archivebox extract input handling and pipelines."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
@@ -271,7 +268,3 @@ class TestExtractCLI:
|
||||
|
||||
# Should show warning about no snapshots or exit normally (empty input)
|
||||
assert result.returncode == 0 or 'No' in result.stderr
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
377
archivebox/tests/test_cli_piping.py
Normal file
377
archivebox/tests/test_cli_piping.py
Normal file
@@ -0,0 +1,377 @@
|
||||
"""
|
||||
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
|
||||
|
||||
This file covers both:
|
||||
- low-level JSONL/stdin parsing behavior that makes CLI piping work
|
||||
- subprocess integration for the supported records `archivebox run` consumes
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import sys
|
||||
import uuid
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
create_test_url,
|
||||
parse_jsonl_output,
|
||||
run_archivebox_cmd,
|
||||
)
|
||||
|
||||
|
||||
PIPE_TEST_ENV = {
|
||||
"PLUGINS": "favicon",
|
||||
"SAVE_FAVICON": "True",
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
}
|
||||
|
||||
|
||||
class MockTTYStringIO(StringIO):
    """In-memory text stream whose ``isatty()`` answer is chosen by the caller.

    Used to hand stdin-like streams to code that branches on whether its
    input is a terminal or a pipe.
    """

    def __init__(self, initial_value: str = "", *, is_tty: bool):
        """Seed the buffer with *initial_value* and remember the TTY flag."""
        super().__init__(initial_value)
        # Returned verbatim by isatty().
        self._is_tty = is_tty

    def isatty(self) -> bool:
        """Return the ``is_tty`` flag supplied at construction time."""
        return self._is_tty
|
||||
|
||||
|
||||
def _stdout_lines(stdout: str) -> list[str]:
|
||||
return [line for line in stdout.splitlines() if line.strip()]
|
||||
|
||||
|
||||
def _assert_stdout_is_jsonl_only(stdout: str) -> None:
    """Assert that *stdout* is non-empty and every non-blank line starts with ``{``.

    This is a cheap shape check (it does not parse the JSON). On failure the
    full *stdout* is used as the assertion message.
    """
    non_blank = _stdout_lines(stdout)
    assert non_blank, "Expected stdout to contain JSONL records"
    for candidate in non_blank:
        assert candidate.lstrip().startswith("{"), stdout
|
||||
|
||||
|
||||
def _sqlite_param(value: object) -> object:
|
||||
if not isinstance(value, str):
|
||||
return value
|
||||
try:
|
||||
return uuid.UUID(value).hex
|
||||
except ValueError:
|
||||
return value
|
||||
|
||||
|
||||
def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
    """Run *sql* against ``<data_dir>/index.sqlite3`` and return the first column of the first row.

    Each parameter is passed through ``_sqlite_param`` before binding.
    Returns ``None`` when the query yields no row. The connection is always
    closed, even if the query raises.
    """
    connection = sqlite3.connect(data_dir / "index.sqlite3")
    try:
        bound = tuple(_sqlite_param(item) for item in params)
        first_row = connection.execute(sql, bound).fetchone()
    finally:
        connection.close()
    if not first_row:
        return None
    return first_row[0]
|
||||
|
||||
|
||||
def test_parse_line_accepts_supported_piping_inputs():
    """``parse_line`` should normalize every input form that CLI pipes accept."""
    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line

    # Blank lines, comments, non-URLs and unsupported schemes are rejected.
    for rejected in ("", " ", "# comment", "not-a-url", "ftp://example.com"):
        assert parse_line(rejected) is None

    # Bare http(s):// and file:// URLs become Snapshot records.
    plain_url = parse_line("https://example.com")
    assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}

    file_url = parse_line("file:///tmp/example.txt")
    assert file_url == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}

    # JSON objects are parsed and keep their extra fields.
    snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
    assert snapshot_json is not None
    assert snapshot_json["type"] == TYPE_SNAPSHOT
    assert snapshot_json["tags"] == "tag1,tag2"

    crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
    assert crawl_json is not None
    assert crawl_json["type"] == TYPE_CRAWL
    assert crawl_json["id"] == "abc123"
    assert crawl_json["max_depth"] == 1

    # Bare IDs: dashed 36-char form first, then compact 32-char hex.
    dashed_id = "01234567-89ab-cdef-0123-456789abcdef"
    assert parse_line(dashed_id) == {"type": TYPE_SNAPSHOT, "id": dashed_id}

    compact_id = "0123456789abcdef0123456789abcdef"
    assert parse_line(compact_id) == {"type": TYPE_SNAPSHOT, "id": compact_id}
|
||||
|
||||
|
||||
def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
    """``read_args_or_stdin`` should consume args, structured JSONL, and pass-through records."""
    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin

    # 1. Positional args are turned into one record per URL, in order.
    arg_urls = ("https://example1.com", "https://example2.com")
    from_args = list(read_args_or_stdin(arg_urls))
    assert [item["url"] for item in from_args] == ["https://example1.com", "https://example2.com"]

    # 2. A piped stream mixing URL, Snapshot JSONL, Tag JSONL, a UUID and garbage.
    mixed_stream = MockTTYStringIO(
        'https://plain-url.com\n'
        '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
        '{"type":"Tag","id":"tag-1","name":"example"}\n'
        '01234567-89ab-cdef-0123-456789abcdef\n'
        'not valid json\n',
        is_tty=False,
    )
    stdin_records = list(read_args_or_stdin((), stream=mixed_stream))
    # The unparseable last line is dropped, leaving four records.
    assert len(stdin_records) == 4
    plain, snapshot, tag, bare_id = stdin_records
    assert plain["url"] == "https://plain-url.com"
    assert snapshot["url"] == "https://jsonl-url.com"
    assert snapshot["tags"] == "test"
    assert tag["type"] == "Tag"
    assert tag["name"] == "example"
    assert bare_id["id"] == "01234567-89ab-cdef-0123-456789abcdef"

    # 3. A Crawl record whose ``urls`` holds an escaped newline stays one record.
    crawl_stream = MockTTYStringIO(
        '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
        is_tty=False,
    )
    crawl_records = list(read_args_or_stdin((), stream=crawl_stream))
    assert len(crawl_records) == 1
    only_crawl = crawl_records[0]
    assert only_crawl["type"] == TYPE_CRAWL
    assert only_crawl["id"] == "crawl-1"

    # 4. With no args and a TTY stream, nothing is read.
    tty_stream = MockTTYStringIO("https://example.com", is_tty=True)
    tty_records = list(read_args_or_stdin((), stream=tty_stream))
    assert tty_records == []
|
||||
|
||||
|
||||
def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
    """``urls.jsonl`` files written by plugins should be discoverable for recursive piping."""
    from archivebox.hooks import collect_urls_from_plugins

    # Two plugin dirs that each wrote a urls.jsonl ...
    parser_outputs = {
        "wget": (
            '{"url":"https://wget-link-1.com"}\n'
            '{"url":"https://wget-link-2.com"}\n'
        ),
        "parse_html_urls": (
            '{"url":"https://html-link-1.com"}\n'
            '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n'
        ),
    }
    for plugin_name, jsonl in parser_outputs.items():
        plugin_dir = tmp_path / plugin_name
        plugin_dir.mkdir()
        (plugin_dir / "urls.jsonl").write_text(jsonl, encoding="utf-8")
    # ... and one plugin dir without any urls.jsonl.
    (tmp_path / "screenshot").mkdir()

    urls = collect_urls_from_plugins(tmp_path)
    assert len(urls) == 4
    # Each collected record is tagged with the plugin dir it came from.
    assert {entry["plugin"] for entry in urls} == {"wget", "parse_html_urls"}

    # Extra fields such as ``title`` survive collection.
    titled = [entry for entry in urls if entry.get("title") == "HTML Link 2"]
    assert len(titled) == 1
    assert titled[0]["url"] == "https://html-link-2.com"

    # A missing directory yields an empty list instead of raising.
    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
|
||||
|
||||
|
||||
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
    """Feed `archivebox crawl create` stdout into `archivebox run` and expect snapshots for the crawl."""
    target_url = create_test_url()

    # Stage 1: create the crawl and keep its raw JSONL stdout for piping.
    created_out, created_err, created_rc = run_archivebox_cmd(
        ["crawl", "create", target_url],
        data_dir=initialized_archive,
    )
    assert created_rc == 0, created_err
    _assert_stdout_is_jsonl_only(created_out)

    crawl = next(
        item
        for item in parse_jsonl_output(created_out)
        if item.get("type") == "Crawl"
    )

    # Stage 2: pipe that stdout into `run` as stdin.
    piped_out, piped_err, piped_rc = run_archivebox_cmd(
        ["run"],
        stdin=created_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert piped_rc == 0, piped_err
    _assert_stdout_is_jsonl_only(piped_out)

    emitted = parse_jsonl_output(piped_out)
    assert any(
        item.get("type") == "Crawl" and item.get("id") == crawl["id"]
        for item in emitted
    )

    # The crawl must have produced at least one snapshot row in the database.
    count_sql = "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?"
    snapshot_count = _db_value(initialized_archive, count_sql, (crawl["id"],))
    assert isinstance(snapshot_count, int)
    assert snapshot_count >= 1
|
||||
|
||||
|
||||
def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
    """Feed `archivebox snapshot list` stdout into `archivebox run` and expect the snapshot to end up sealed."""
    target_url = create_test_url()

    created_out, created_err, created_rc = run_archivebox_cmd(
        ["snapshot", "create", target_url],
        data_dir=initialized_archive,
    )
    assert created_rc == 0, created_err
    snapshot = next(
        item
        for item in parse_jsonl_output(created_out)
        if item.get("type") == "Snapshot"
    )

    # NOTE(review): this first attempt filters URLs by the snapshot *id*; it looks
    # like it relies on the fallback below whenever that matches nothing - confirm.
    listed_out, listed_err, listed_rc = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"],
        data_dir=initialized_archive,
    )
    if not (listed_rc == 0 and parse_jsonl_output(listed_out)):
        # Fall back to listing by the URL itself, without the status filter.
        listed_out, listed_err, listed_rc = run_archivebox_cmd(
            ["snapshot", "list", f"--url__icontains={target_url}"],
            data_dir=initialized_archive,
        )
    assert listed_rc == 0, listed_err
    _assert_stdout_is_jsonl_only(listed_out)

    piped_out, piped_err, piped_rc = run_archivebox_cmd(
        ["run"],
        stdin=listed_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert piped_rc == 0, piped_err
    _assert_stdout_is_jsonl_only(piped_out)

    emitted = parse_jsonl_output(piped_out)
    assert any(
        item.get("type") == "Snapshot" and item.get("id") == snapshot["id"]
        for item in emitted
    )

    status_sql = "SELECT status FROM core_snapshot WHERE id = ?"
    snapshot_status = _db_value(initialized_archive, status_sql, (snapshot["id"],))
    assert snapshot_status == "sealed"
|
||||
|
||||
|
||||
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
    """Pipe `archivebox archiveresult list` into the `orchestrator` alias: stdout stays JSONL, the rename hint goes to stderr."""
    target_url = create_test_url()

    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create", target_url],
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err

    # Create a favicon ArchiveResult from the snapshot JSONL.
    made_out, made_err, made_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert made_rc == 0, made_err

    made_records = parse_jsonl_output(made_out)
    archiveresult = next(
        item for item in made_records if item.get("type") == "ArchiveResult"
    )

    listed_out, listed_err, listed_rc = run_archivebox_cmd(
        ["archiveresult", "list", "--plugin=favicon"],
        data_dir=initialized_archive,
    )
    assert listed_rc == 0, listed_err
    _assert_stdout_is_jsonl_only(listed_out)

    # Invoke the old command name, feeding it the listing as stdin.
    alias_out, alias_err, alias_rc = run_archivebox_cmd(
        ["orchestrator"],
        stdin=listed_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert alias_rc == 0, alias_err
    _assert_stdout_is_jsonl_only(alias_out)
    assert "renamed to `archivebox run`" in alias_err

    emitted = parse_jsonl_output(alias_out)
    assert any(
        item.get("type") == "ArchiveResult" and item.get("id") == archiveresult["id"]
        for item in emitted
    )
|
||||
|
||||
|
||||
def test_binary_create_stdout_pipes_into_run(initialized_archive):
    """Feed `archivebox binary create` stdout into `archivebox run` and check the Binary row's status."""
    create_args = [
        "binary",
        "create",
        "--name=python3",
        f"--abspath={sys.executable}",
        "--version=test",
    ]
    created_out, created_err, created_rc = run_archivebox_cmd(
        create_args,
        data_dir=initialized_archive,
    )
    assert created_rc == 0, created_err
    _assert_stdout_is_jsonl_only(created_out)

    binary = next(
        item
        for item in parse_jsonl_output(created_out)
        if item.get("type") == "Binary"
    )

    piped_out, piped_err, piped_rc = run_archivebox_cmd(
        ["run"],
        stdin=created_out,
        data_dir=initialized_archive,
        timeout=120,
    )
    assert piped_rc == 0, piped_err
    _assert_stdout_is_jsonl_only(piped_out)

    emitted = parse_jsonl_output(piped_out)
    assert any(
        item.get("type") == "Binary" and item.get("id") == binary["id"]
        for item in emitted
    )

    # Either terminal state of the lookup is acceptable here.
    status_sql = "SELECT status FROM machine_binary WHERE id = ?"
    status = _db_value(initialized_archive, status_sql, (binary["id"],))
    assert status in {"queued", "installed"}
|
||||
|
||||
|
||||
def test_multi_stage_pipeline_into_run(initialized_archive):
    """Chain crawl create -> snapshot create -> archiveresult create -> run, checking JSONL-only stdout at each stage."""
    target_url = create_test_url()

    # Stage 1: crawl create
    crawl_out, crawl_err, crawl_rc = run_archivebox_cmd(
        ["crawl", "create", target_url],
        data_dir=initialized_archive,
    )
    assert crawl_rc == 0, crawl_err
    _assert_stdout_is_jsonl_only(crawl_out)

    # Stage 2: snapshot create, reading the crawl JSONL from stdin
    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=crawl_out,
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err
    _assert_stdout_is_jsonl_only(snap_out)

    # Stage 3: archiveresult create, reading the snapshot JSONL from stdin
    result_out, result_err, result_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert result_rc == 0, result_err
    _assert_stdout_is_jsonl_only(result_out)

    # Stage 4: run, reading the archiveresult JSONL from stdin
    piped_out, piped_err, piped_rc = run_archivebox_cmd(
        ["run"],
        stdin=result_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert piped_rc == 0, piped_err
    _assert_stdout_is_jsonl_only(piped_out)

    emitted = parse_jsonl_output(piped_out)
    snapshot = next(item for item in emitted if item.get("type") == "Snapshot")
    assert any(item.get("type") == "ArchiveResult" for item in emitted)

    status_sql = "SELECT status FROM core_snapshot WHERE id = ?"
    snapshot_status = _db_value(initialized_archive, status_sql, (snapshot["id"],))
    assert snapshot_status == "sealed"
|
||||
@@ -1,156 +0,0 @@
|
||||
import json as pyjson
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
# Holds references to the fixtures imported from .fixtures; presumably this keeps
# linters from flagging those imports as unused - confirm before removing.
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
if len(snapshot_id) == 32:
|
||||
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
|
||||
elif len(snapshot_id) == 36 and "-" in snapshot_id:
|
||||
candidates.add(snapshot_id.replace("-", ""))
|
||||
|
||||
for needle in candidates:
|
||||
for path in data_dir.rglob(needle):
|
||||
if path.is_dir():
|
||||
return path
|
||||
return None
|
||||
|
||||
|
||||
def _latest_snapshot_dir(data_dir: Path) -> Path:
    """Return the output directory of the most recently created snapshot.

    Reads the newest `core_snapshot` row (by `created_at`) from
    `index.sqlite3` in *data_dir*, then locates its directory with
    `_find_snapshot_dir`. Fails an assertion if there is no snapshot row or
    no matching directory.
    """
    newest_sql = "SELECT id FROM core_snapshot ORDER BY created_at DESC LIMIT 1"
    db = sqlite3.connect(data_dir / "index.sqlite3")
    try:
        newest_row = db.execute(newest_sql).fetchone()
    finally:
        # Close the connection whether or not the query succeeded.
        db.close()

    assert newest_row is not None, "Expected a snapshot to be created"
    newest_id = newest_row[0]
    located = _find_snapshot_dir(data_dir, str(newest_id))
    assert located is not None, f"Snapshot output directory not found for {newest_id}"
    return located
|
||||
|
||||
|
||||
def _latest_plugin_result(data_dir: Path, plugin: str) -> tuple[str, str, dict]:
    """Return `(snapshot_id, status, output_files)` for the newest ArchiveResult of *plugin*.

    The row is read from `core_archiveresult` in `index.sqlite3` under
    *data_dir*, newest `created_at` first. An `output_files` value stored as
    a string is JSON-decoded (an empty string decodes as `{}`); any falsy
    result is returned as an empty dict. Fails an assertion when no row
    exists for the plugin.
    """
    latest_sql = (
        "SELECT snapshot_id, status, output_files FROM core_archiveresult "
        "WHERE plugin = ? ORDER BY created_at DESC LIMIT 1"
    )
    db = sqlite3.connect(data_dir / "index.sqlite3")
    try:
        latest = db.execute(latest_sql, (plugin,)).fetchone()
    finally:
        db.close()

    assert latest is not None, f"Expected an ArchiveResult row for plugin={plugin}"
    snapshot_id, status, raw_files = latest
    if isinstance(raw_files, str):
        decoded = pyjson.loads(raw_files or "{}")
    else:
        decoded = raw_files
    return str(snapshot_id), str(status), decoded or {}
|
||||
|
||||
|
||||
def _plugin_output_paths(data_dir: Path, plugin: str) -> list[Path]:
    """Return the on-disk paths recorded for the newest ArchiveResult of *plugin*.

    Asserts that the result succeeded, that it recorded output files, that
    the snapshot directory can be found, and that every recorded file exists
    under `<snapshot_dir>/<plugin>/`.
    """
    snapshot_id, status, output_files = _latest_plugin_result(data_dir, plugin)
    assert status == "succeeded", f"Expected {plugin} ArchiveResult to succeed, got {status}"
    assert output_files, f"Expected {plugin} ArchiveResult to record output_files"

    snapshot_root = _find_snapshot_dir(data_dir, snapshot_id)
    assert snapshot_root is not None, f"Snapshot output directory not found for {snapshot_id}"

    # Recorded keys are treated as paths relative to the plugin's directory.
    plugin_root = snapshot_root / plugin
    recorded = []
    absent = []
    for relative in output_files.keys():
        candidate = plugin_root / relative
        recorded.append(candidate)
        if not candidate.exists():
            absent.append(candidate)

    assert not absent, f"Expected plugin outputs to exist on disk, missing: {absent}"
    return recorded
|
||||
|
||||
|
||||
def _archivebox_env(base_env: dict, data_dir: Path) -> dict:
    """Return a copy of *base_env* extended with a per-data-dir TMP_DIR.

    Creates `/tmp/abx-<data_dir name>` if needed, stores it as `TMP_DIR`, and
    sets `ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS` to "true". *base_env* itself is
    not modified.
    """
    merged = base_env.copy()
    scratch = Path("/tmp", f"abx-{data_dir.name}")
    scratch.mkdir(parents=True, exist_ok=True)
    merged.update(
        {
            "TMP_DIR": str(scratch),
            "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true",
        }
    )
    return merged
|
||||
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with the singlefile plugin and expect an HTML output file on disk."""
    archive_root = Path.cwd()
    child_env = _archivebox_env(disable_extractors_dict, archive_root)
    child_env["SAVE_SINGLEFILE"] = "true"

    command = ['archivebox', 'add', '--plugins=singlefile', 'https://example.com']
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        env=child_env,
        timeout=900,
    )
    assert completed.returncode == 0, completed.stderr

    produced = _plugin_output_paths(archive_root, "singlefile")
    html_suffixes = (".html", ".htm")
    assert any(item.suffix in html_suffixes for item in produced)
|
||||
|
||||
def test_readability_works(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with singlefile+readability and expect an HTML readability output on disk."""
    archive_root = Path.cwd()
    child_env = _archivebox_env(disable_extractors_dict, archive_root)
    child_env["SAVE_SINGLEFILE"] = "true"
    child_env["SAVE_READABILITY"] = "true"

    command = ['archivebox', 'add', '--plugins=singlefile,readability', 'https://example.com']
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        env=child_env,
        timeout=900,
    )
    assert completed.returncode == 0, completed.stderr

    produced = _plugin_output_paths(archive_root, "readability")
    html_suffixes = (".html", ".htm")
    assert any(item.suffix in html_suffixes for item in produced)
|
||||
|
||||
def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with wget+htmltotext and expect a .txt htmltotext output on disk."""
    archive_root = Path.cwd()
    child_env = _archivebox_env(disable_extractors_dict, archive_root)
    child_env["SAVE_WGET"] = "true"
    child_env["SAVE_HTMLTOTEXT"] = "true"

    command = ['archivebox', 'add', '--plugins=wget,htmltotext', 'https://example.com']
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        env=child_env,
        timeout=900,
    )
    assert completed.returncode == 0, completed.stderr

    produced = _plugin_output_paths(archive_root, "htmltotext")
    assert any(item.suffix == ".txt" for item in produced)
|
||||
|
||||
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
    """With USE_NODE=false, `archivebox add` output must not mention singlefile or readability.

    The command's stdout is scanned for the "> singlefile" and
    "> readability" markers; neither may appear.
    """
    env = _archivebox_env(disable_extractors_dict, Path.cwd())
    env.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
    add_process = subprocess.run(
        ['archivebox', 'add', '--plugins=readability,dom,singlefile', 'https://example.com'],
        capture_output=True,
        env=env,
        # Same bound as the sibling extractor tests: a hung child process now
        # raises subprocess.TimeoutExpired instead of blocking the suite forever.
        timeout=900,
    )
    # NOTE(review): the return code is not checked, so a command that fails
    # without printing anything also passes - confirm that is intended.
    output_str = add_process.stdout.decode("utf-8")
    assert "> singlefile" not in output_str
    assert "> readability" not in output_str
|
||||
|
||||
def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with the headers plugin and expect a JSON output containing Content-Type.

    The response headers are read from the `response_headers` key of the
    plugin's JSON output, falling back to `headers`.
    """
    data_dir = Path.cwd()
    env = _archivebox_env(disable_extractors_dict, data_dir)
    env.update({"SAVE_HEADERS": "true"})
    add_process = subprocess.run(
        ['archivebox', 'add', '--plugins=headers', 'https://example.com'],
        capture_output=True,
        text=True,
        env=env,
        timeout=900,
    )
    assert add_process.returncode == 0, add_process.stderr

    output_files = _plugin_output_paths(data_dir, "headers")
    output_file = next((path for path in output_files if path.suffix == ".json"), None)
    assert output_file is not None, f"Expected headers output_files to include a JSON file, got: {output_files}"

    with open(output_file, 'r', encoding='utf-8') as f:
        headers = pyjson.load(f)

    response_headers = headers.get("response_headers") or headers.get("headers") or {}
    assert isinstance(response_headers, dict), f"Expected response_headers dict, got: {response_headers!r}"

    # HTTP header field names are case-insensitive, so compare lower-cased
    # names instead of accepting only "Content-Type" and "content-type".
    header_names = {str(name).lower() for name in response_headers}
    assert 'content-type' in header_names, f"Expected a Content-Type header, got: {sorted(header_names)}"
|
||||
@@ -13,7 +13,6 @@ ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
|
||||
project_root = Path(__file__).resolve().parents[2]
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
import os
|
||||
@@ -81,7 +80,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
cwd=project_root,
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
@@ -90,7 +89,6 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
|
||||
|
||||
def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: str):
|
||||
project_root = Path(__file__).resolve().parents[2]
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
import os
|
||||
@@ -137,7 +135,7 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
cwd=project_root,
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
@@ -146,7 +144,6 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
|
||||
|
||||
|
||||
def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request_url: str, stored_url: str):
|
||||
project_root = Path(__file__).resolve().parents[2]
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
import os
|
||||
@@ -199,7 +196,7 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
cwd=project_root,
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
|
||||
@@ -29,6 +29,7 @@ Usage:
|
||||
__package__ = 'archivebox.workers'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Type
|
||||
from datetime import datetime, timedelta
|
||||
@@ -258,9 +259,7 @@ class Orchestrator:
|
||||
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
|
||||
"""Spawn a new worker process. Returns PID or None if spawn failed."""
|
||||
try:
|
||||
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
|
||||
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
|
||||
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
|
||||
|
||||
# CRITICAL: Block until worker registers itself in Process table
|
||||
# This prevents race condition where orchestrator spawns multiple workers
|
||||
@@ -281,17 +280,6 @@ class Orchestrator:
|
||||
# 4. Parent is this orchestrator
|
||||
# 5. Started recently (within last 10 seconds)
|
||||
|
||||
# Debug: Check all processes with this PID first
|
||||
if elapsed < 0.5:
|
||||
all_procs = list(Process.objects.filter(pid=pid))
|
||||
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
|
||||
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
|
||||
for p in all_procs:
|
||||
print(
|
||||
f'[yellow] -> type={p.process_type} status={p.status} '
|
||||
f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
|
||||
)
|
||||
|
||||
worker_process = Process.objects.filter(
|
||||
pid=pid,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
@@ -302,7 +290,6 @@ class Orchestrator:
|
||||
|
||||
if worker_process:
|
||||
# Worker successfully registered!
|
||||
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
|
||||
return pid
|
||||
|
||||
time.sleep(poll_interval)
|
||||
@@ -653,14 +640,15 @@ class Orchestrator:
|
||||
def runloop(self) -> None:
|
||||
"""Main orchestrator loop."""
|
||||
from rich.live import Live
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
|
||||
import sys
|
||||
import os
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
# Enable progress layout only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
plain_output = not IS_TTY
|
||||
show_progress = is_tty and self.exit_on_idle
|
||||
# When stdout is not a TTY, it may be reserved for JSONL pipeline output.
|
||||
# Keep the plain progress view, but emit it to stderr instead of stdout.
|
||||
plain_output = not is_tty
|
||||
self.on_startup()
|
||||
|
||||
if not show_progress:
|
||||
@@ -1241,7 +1229,7 @@ class Orchestrator:
|
||||
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
for panel, line in new_lines:
|
||||
if line:
|
||||
print(f"[{ts}] [{panel}] {line}")
|
||||
print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
|
||||
last_plain_lines = set(plain_lines)
|
||||
|
||||
# Track idle state
|
||||
@@ -1271,7 +1259,7 @@ class Orchestrator:
|
||||
except KeyboardInterrupt:
|
||||
if progress_layout:
|
||||
progress_layout.log_event("Interrupted by user", style="red")
|
||||
print() # Newline after ^C
|
||||
print(file=sys.stderr) # Newline after ^C
|
||||
self.on_shutdown(error=KeyboardInterrupt())
|
||||
except BaseException as e:
|
||||
if progress_layout:
|
||||
@@ -1310,7 +1298,7 @@ class Orchestrator:
|
||||
Used by commands like 'add' to ensure orchestrator is running.
|
||||
"""
|
||||
if cls.is_running():
|
||||
print('[grey53]👨✈️ Orchestrator already running[/grey53]')
|
||||
print('[grey53]👨✈️ Orchestrator already running[/grey53]', file=sys.stderr)
|
||||
# Return a placeholder - actual orchestrator is in another process
|
||||
return cls(exit_on_idle=exit_on_idle)
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ package-dir = {"archivebox" = "archivebox"}
|
||||
line-length = 140
|
||||
target-version = "py313"
|
||||
src = ["archivebox"]
|
||||
exclude = ["*.pyi", "typings/", "migrations/"]
|
||||
exclude = ["*.pyi", "typings/", "migrations/", "archivebox/tests/data/"]
|
||||
|
||||
# https://docs.astral.sh/ruff/rules/
|
||||
[tool.ruff.lint]
|
||||
@@ -184,6 +184,7 @@ ignore = ["E731", "E303", "E266", "E241", "E222"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = [ "archivebox/tests" ]
|
||||
norecursedirs = ["archivebox/tests/data"]
|
||||
DJANGO_SETTINGS_MODULE = "archivebox.core.settings"
|
||||
# Note: Plugin tests under abx_plugins/plugins/ must NOT load Django
|
||||
# They use a conftest.py to disable Django automatically
|
||||
@@ -254,6 +255,8 @@ exclude = [
|
||||
"**/node_modules",
|
||||
"**/__pycache__",
|
||||
"**/migrations",
|
||||
"archivebox/tests/data",
|
||||
"archivebox/tests/data/**",
|
||||
]
|
||||
stubPath = "./typings"
|
||||
venvPath = "."
|
||||
@@ -267,7 +270,7 @@ pythonPlatform = "Linux"
|
||||
|
||||
[tool.ty]
|
||||
environment = { python-version = "3.13", python-platform = "linux" }
|
||||
src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations"] }
|
||||
src = { include = ["archivebox"], exclude = [".venv", "**/*.pyi", "**/__init__.pyi", "**/node_modules", "**/__pycache__", "**/migrations", "archivebox/tests/data", "archivebox/tests/data/**"] }
|
||||
|
||||
|
||||
[project.scripts]
|
||||
|
||||
Reference in New Issue
Block a user