Restore CLI compat and plugin dependency handling

This commit is contained in:
Nick Sweeting
2026-03-15 06:06:18 -07:00
parent 6b482c62df
commit 1f792d7199
19 changed files with 302 additions and 92 deletions

View File

@@ -41,6 +41,7 @@ class ArchiveBoxGroup(click.Group):
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'extract': 'archivebox.cli.archivebox_extract.main',
'list': 'archivebox.cli.archivebox_list.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
@@ -55,6 +56,10 @@ class ArchiveBoxGroup(click.Group):
# Introspection commands
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
}
legacy_model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
}
all_subcommands = {
**meta_commands,
**setup_commands,
@@ -67,12 +72,35 @@ class ArchiveBoxGroup(click.Group):
'archive': 'add',
# Old commands replaced by new model commands
'orchestrator': 'run',
'extract': 'archiveresult',
}
legacy_model_subcommands = {
'crawl': {'create', 'list', 'update', 'delete'},
'snapshot': {'create', 'list', 'update', 'delete'},
}
@classmethod
def get_canonical_name(cls, cmd_name):
    """Resolve a possibly-renamed subcommand to its canonical name.

    Unknown names pass through unchanged so lookup never raises.
    """
    try:
        return cls.renamed_commands[cmd_name]
    except KeyError:
        return cmd_name
@classmethod
def _should_use_legacy_model_command(cls, cmd_name: str) -> bool:
    """Decide whether `archivebox <cmd> ...` should dispatch to the legacy
    URL-based compat entrypoint instead of the new model-subcommand CLI.

    Legacy dispatch is used only when the first argument after the command
    is neither a help flag nor one of the known model subcommands
    (e.g. `archivebox crawl https://...` vs `archivebox crawl create`).
    """
    if cmd_name not in cls.legacy_model_commands:
        return False
    try:
        cmd_position = sys.argv.index(cmd_name)
    except ValueError:
        # Command name not present verbatim in argv (e.g. invoked via alias).
        return False
    trailing_args = sys.argv[cmd_position + 1:]
    if not trailing_args:
        return False
    head = trailing_args[0]
    if head in ('-h', '--help'):
        return False
    return head not in cls.legacy_model_subcommands[cmd_name]
def get_command(self, ctx, cmd_name):
@@ -82,6 +110,9 @@ class ArchiveBoxGroup(click.Group):
print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
cmd_name = new_name
ctx.invoked_subcommand = cmd_name
if self._should_use_legacy_model_command(cmd_name):
return self._lazy_load(self.legacy_model_commands[cmd_name])
# handle lazy loading of commands
if cmd_name in self.all_subcommands:
@@ -91,8 +122,8 @@ class ArchiveBoxGroup(click.Group):
return super().get_command(ctx, cmd_name)
@classmethod
def _lazy_load(cls, cmd_name):
import_path = cls.all_subcommands[cmd_name]
def _lazy_load(cls, cmd_name_or_path):
import_path = cls.all_subcommands.get(cmd_name_or_path, cmd_name_or_path)
modname, funcname = import_path.rsplit('.', 1)
# print(f'LAZY LOADING {import_path}')

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""Backwards-compatible shim for the old `archivebox crawl URL...` CLI."""

__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'

import sys

import rich_click as click

from archivebox.cli.archivebox_add import add


@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
@click.argument('urls', nargs=-1)
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
    """Backwards-compatible `archivebox crawl URL...` entrypoint."""
    # --status/--wait are accepted purely so legacy invocations don't error;
    # the new add() pipeline manages status/waiting itself.
    del status, wait
    url_list = list(urls)
    add(url_list, depth=depth, tag=tag, index_only=True, bg=True)
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -226,7 +226,7 @@ def is_archiveresult_id(value: str) -> bool:
@click.command()
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):

View File

@@ -12,6 +12,7 @@ import rich_click as click
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.config.constants import CONSTANTS
from archivebox.config.django import setup_django
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
@@ -65,6 +66,9 @@ def remove(filter_patterns: Iterable[str]=(),
for snapshot in snapshots:
if delete:
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
legacy_path = CONSTANTS.ARCHIVE_DIR / snapshot.timestamp
if legacy_path.is_symlink():
legacy_path.unlink(missing_ok=True)
finally:
timer.end()

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Backwards-compatible shim for the old `archivebox snapshot URL...` CLI."""

__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'

import sys

import rich_click as click

from archivebox.cli.archivebox_snapshot import create_snapshots


@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@click.argument('urls', nargs=-1)
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
    """Backwards-compatible `archivebox snapshot URL...` entrypoint."""
    # Exit with whatever status code the snapshot-creation helper reports.
    exit_code = create_snapshots(urls, tag=tag, status=status, depth=depth)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()

View File

@@ -1557,6 +1557,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'url': self.url,
'title': self.title,
'tags': self.tags_str(),
'tags_str': self.tags_str(),
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None,
'timestamp': self.timestamp,
@@ -2306,7 +2307,7 @@ class SnapshotMachine(BaseStateMachine):
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).count()
if remaining_active == 0:
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
# Seal the parent crawl
crawl.sm.seal()

View File

@@ -332,8 +332,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
"""
import time
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.hooks import run_hook, discover_hooks, process_hook_records, is_finite_background_hook
from archivebox.config.configset import get_config
from archivebox.machine.models import Binary, Machine
# Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
debug_log = Path('/tmp/archivebox_crawl_debug.log')
@@ -344,6 +345,43 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Get merged config with crawl context
config = get_config(crawl=self)
machine = Machine.current()
declared_binary_names: set[str] = set()
def install_declared_binaries(binary_names: set[str]) -> None:
    # Best-effort install pass over crawl-declared dependencies: tick the
    # state machine of every Binary on this machine (closed over from the
    # enclosing method) that matches `binary_names` and is not yet
    # INSTALLED, then raise if any remain unresolved afterwards.
    if not binary_names:
        return
    pending_binaries = Binary.objects.filter(
        machine=machine,
        name__in=binary_names,
    ).exclude(
        status=Binary.StatusChoices.INSTALLED,
    ).order_by('retry_at')
    for binary in pending_binaries:
        try:
            # tick() advances the install state machine; one failing binary
            # must not abort the pass — the unresolved check below reports it.
            binary.sm.tick()
        except Exception:
            continue
    # Re-query after ticking: anything still not INSTALLED failed to resolve.
    unresolved_binaries = list(
        Binary.objects.filter(
            machine=machine,
            name__in=binary_names,
        ).exclude(
            status=Binary.StatusChoices.INSTALLED,
        ).order_by('name')
    )
    if unresolved_binaries:
        binary_details = ', '.join(
            f'{binary.name} (status={binary.status})'
            for binary in unresolved_binaries
        )
        # Fail loudly rather than creating snapshots with missing deps.
        raise RuntimeError(
            f'Crawl dependencies failed to install before continuing: {binary_details}'
        )
# Discover and run on_Crawl hooks
with open(debug_log, 'a') as f:
f.write(f'Discovering Crawl hooks...\n')
@@ -378,9 +416,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if hook_elapsed > 0.5: # Log slow hooks
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
# Background hook - still running
# Finite background hooks must finish before snapshots start so they can
# emit dependency records (Binary, Machine config, etc.).
if process.status == process.StatusChoices.RUNNING:
continue
if not is_finite_background_hook(hook.name):
continue
try:
process.wait(timeout=process.timeout)
except Exception:
continue
# Foreground hook - process JSONL records
from archivebox.hooks import extract_records_from_process
@@ -394,33 +438,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if stats:
print(f'[green]✓ Created: {stats}[/green]')
# Ensure any newly declared binaries are installed before creating snapshots
from archivebox.machine.models import Binary, Machine
from django.utils import timezone
hook_binary_names = {
str(record.get('name')).strip()
for record in records
if record.get('type') == 'Binary' and record.get('name')
}
hook_binary_names.discard('')
if hook_binary_names:
declared_binary_names.update(hook_binary_names)
install_declared_binaries(hook_binary_names)
machine = Machine.current()
while True:
pending_binaries = Binary.objects.filter(
machine=machine,
status=Binary.StatusChoices.QUEUED,
retry_at__lte=timezone.now(),
).order_by('retry_at')
if not pending_binaries.exists():
break
for binary in pending_binaries:
try:
binary.sm.tick()
except Exception:
continue
# Exit if nothing else is immediately retryable
if not Binary.objects.filter(
machine=machine,
status=Binary.StatusChoices.QUEUED,
retry_at__lte=timezone.now(),
).exists():
break
# Safety check: don't create snapshots if any crawl-declared dependency
# is still unresolved after all crawl hooks have run.
install_declared_binaries(declared_binary_names)
# Create snapshots from all URLs in self.urls
with open(debug_log, 'a') as f:

View File

@@ -121,6 +121,11 @@ def is_background_hook(hook_name: str) -> bool:
return '.bg.' in hook_name or '__background' in hook_name
def is_finite_background_hook(hook_name: str) -> bool:
    """Check if a background hook is finite-lived and should be awaited.

    Finite background hooks carry a `.finite.bg.` marker in their filename.
    """
    return hook_name.find('.finite.bg.') != -1
def iter_plugin_dirs() -> List[Path]:
"""Iterate over all built-in and user plugin directories."""
plugin_dirs: List[Path] = []
@@ -904,8 +909,25 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
plugins_whitelist = config.get('PLUGINS', '')
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
# PLUGINS whitelist is specified - include transitive required_plugins from config.json
plugin_configs = discover_plugin_configs()
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
pending = list(plugin_names)
while pending:
current = pending.pop()
schema = plugin_configs.get(current, {})
required_plugins = schema.get('required_plugins', [])
if not isinstance(required_plugins, list):
continue
for required_plugin in required_plugins:
required_plugin_name = str(required_plugin).strip().lower()
if not required_plugin_name or required_plugin_name in plugin_names:
continue
plugin_names.add(required_plugin_name)
pending.append(required_plugin_name)
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
enabled = False

View File

@@ -1,8 +1,18 @@
# Generated by Django 6.0 on 2026-01-05 01:09
from django.db import migrations
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop the legacy `output_dir` column from machine_binary only if it
    still exists, making this migration idempotent across databases that
    may or may not have the column."""
    cursor = schema_editor.connection.cursor()
    # SQLite-specific introspection: PRAGMA table_info rows are
    # (cid, name, type, notnull, dflt_value, pk) — row[1] is the column name.
    cursor.execute("PRAGMA table_info(machine_binary)")
    columns = {row[1] for row in cursor.fetchall()}
    if 'output_dir' not in columns:
        # Column already gone — nothing to do.
        return
    Binary = apps.get_model('machine', 'Binary')
    schema_editor.remove_field(Binary, Binary._meta.get_field('output_dir'))
class Migration(migrations.Migration):
dependencies = [
@@ -10,8 +20,15 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RemoveField(
model_name='binary',
name='output_dir',
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunPython(remove_output_dir_if_exists, migrations.RunPython.noop),
],
state_operations=[
migrations.RemoveField(
model_name='binary',
name='output_dir',
),
],
),
]

View File

@@ -352,7 +352,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
# Case 2: From binaries.json - create queued binary (needs installation)
if 'binproviders' in record or ('overrides' in record and not abspath):
binary, created = Binary.objects.get_or_create(
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={

View File

@@ -13,16 +13,16 @@ def process(tmp_path):
def disable_extractors_dict():
env = os.environ.copy()
env.update({
"USE_WGET": "false",
"USE_SINGLEFILE": "false",
"USE_READABILITY": "false",
"USE_MERCURY": "false",
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"USE_GIT": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",

View File

@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
assert snapshot is not None, "Should create at least one snapshot"
def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
"""Test that crawl creates a Seed object for input."""
def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
"""Test that crawl input URLs are stored on the Crawl record."""
os.chdir(tmp_path)
subprocess.run(
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict
conn = sqlite3.connect('index.sqlite3')
c = conn.cursor()
seed = c.execute("SELECT id FROM crawls_seed").fetchone()
crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
conn.close()
assert seed is not None, "Seed should be created for crawl input"
assert crawl_urls is not None, "Crawl should be created for crawl input"
assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
class TestCrawlCLI:
@@ -178,7 +179,7 @@ class TestCrawlCLI:
)
assert result.returncode == 0
assert '--depth' in result.stdout or '-d' in result.stdout
assert 'create' in result.stdout
if __name__ == '__main__':

View File

@@ -3,7 +3,7 @@ import json as pyjson
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
assert output_file.exists()
def test_readability_works(tmp_path, process, disable_extractors_dict):
disable_extractors_dict.update({"USE_READABILITY": "true"})
disable_extractors_dict.update({"SAVE_READABILITY": "true"})
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
assert output_file.exists()
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
output_str = add_process.stdout.decode("utf-8")

View File

@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
env = os.environ.copy()
env["TMP_DIR"] = str(tmp_short)
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
env["SAVE_TITLE"] = "True"
env["SAVE_WGET"] = "True"
env["SAVE_SINGLEFILE"] = "True"
env["SAVE_READABILITY"] = "False"
env["SAVE_HTMLTOTEXT"] = "True"
env["SAVE_HEADERS"] = "True"
env["SAVE_PDF"] = "False"
env["SAVE_SCREENSHOT"] = "False"
env["SAVE_ARCHIVEDOTORG"] = "False"
env["SAVE_YTDLP"] = "False"
env["SAVE_GIT"] = "False"
init = subprocess.run(
["archivebox", "init"],
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
assert init.returncode == 0, f"archivebox init failed: {init.stderr}"
result = subprocess.run(
["archivebox", "add", "https://example.com"],
["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
capture_output=True,
text=True,
timeout=900,
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
)
text_hits = 0
for path in (
*snapshot_dir.glob("*_readability/content.txt"),
snapshot_dir / "readability" / "content.txt",
):
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
text_hits += 1
for path in (
*snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
snapshot_dir / "htmltotext" / "htmltotext.txt",
):
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
text_hits += 1
assert text_hits >= 2, (
"Expected multiple text extractors to contain Example Domain "
f"(readability/htmltotext hits={text_hits})."
assert text_hits >= 1, (
"Expected htmltotext output to contain Example Domain "
f"(htmltotext hits={text_hits})."
)

View File

@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
env = os.environ.copy()
env.update({
# Disable most extractors
"USE_WGET": "false",
"USE_SINGLEFILE": "false",
"USE_READABILITY": "false",
"USE_MERCURY": "false",
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"USE_GIT": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
# Enable only parse_html_urls for this test
env = os.environ.copy()
env.update({
"USE_WGET": "false",
"USE_SINGLEFILE": "false",
"USE_READABILITY": "false",
"USE_MERCURY": "false",
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"USE_GIT": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
env = os.environ.copy()
env.update({
"URL_ALLOWLIST": r"monadical\.com/.*", # Only crawl same domain
"SAVE_READABILITY": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_MERCURY": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_PDF": "false",
"SAVE_HEADERS": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_TITLE": "false",
})
# Start a crawl with depth=1 (just one hop to test recursive crawling)
# Use file:// URL so it's instant, no network fetch needed
proc = subprocess.Popen(
['archivebox', 'add', '--depth=1', f'file://{test_html}'],
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
# Verify snapshot exists
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_before >= 1
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count == 0
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_before >= 2
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_after == 0
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
conn.close()
assert crawl_count == 2

View File

@@ -9,7 +9,10 @@ from pathlib import Path
from archivebox.tests.conftest import create_test_url
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
ADMIN_HOST = 'admin.archivebox.localhost:8000'
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
project_root = Path(__file__).resolve().parents[2]
script = textwrap.dedent(
f"""
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
target_url = {request_url!r}
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp.status_code == 302, resp.status_code
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
)
assert resp['Location'] == f"/{{snapshot.url_path}}"
resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp2.status_code == 302, resp2.status_code
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
assert resp2['Location'] == f"/{{snapshot.url_path}}"
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
"""/web/https://... should work for authenticated users even when public add is off."""
url = create_test_url(domain='example.com', path='savepagenow-auth')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
assert result.returncode == 0, (
"SavePageNow shortcut (logged-in) test failed.\n"
f"stdout:\n{result.stdout}\n"
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
url = create_test_url(domain='example.com', path='savepagenow-public')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
assert result.returncode == 0, (
"SavePageNow shortcut (public add) test failed.\n"
f"stdout:\n{result.stdout}\n"

View File

@@ -6,14 +6,19 @@ from .fixtures import *
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
"""Test that title is extracted from the page."""
disable_extractors_dict.update({"SAVE_TITLE": "true"})
subprocess.run(['archivebox', 'add', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
add_process = subprocess.run(
['archivebox', 'add', '--plugins=title', 'https://example.com'],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
os.chdir(tmp_path)
conn = sqlite3.connect("index.sqlite3")
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute("SELECT title from archivebox.core.snapshot")
c.execute("SELECT title FROM core_snapshot")
snapshot = c.fetchone()
conn.close()
@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
and breaks the layout.
"""
disable_extractors_dict.update({"SAVE_TITLE": "true"})
subprocess.run(['archivebox', 'add', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
add_process = subprocess.run(
['archivebox', 'add', '--plugins=title', 'https://example.com'],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
# Should not contain unescaped HTML tags in output

View File

@@ -1,5 +1,30 @@
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from threading import Thread
from archivebox.misc.util import download_url
class _ExampleHandler(BaseHTTPRequestHandler):
    """Minimal local HTTP handler serving a fixed 'Example Domain' page,
    so tests can exercise download_url() without real network access."""

    def do_GET(self):
        payload = b"<html><body><h1>Example Domain</h1></body></html>"
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, format, *args):
        # Suppress per-request stderr logging to keep test output clean.
        return
def test_download_url_downloads_content():
text = download_url("https://example.com")
server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
thread = Thread(target=server.serve_forever, daemon=True)
thread.start()
try:
text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
finally:
server.shutdown()
server.server_close()
thread.join(timeout=5)
assert "Example Domain" in text