mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Restore CLI compat and plugin dependency handling
This commit is contained in:
@@ -41,6 +41,7 @@ class ArchiveBoxGroup(click.Group):
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
'list': 'archivebox.cli.archivebox_list.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
@@ -55,6 +56,10 @@ class ArchiveBoxGroup(click.Group):
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
}
|
||||
legacy_model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
**setup_commands,
|
||||
@@ -67,12 +72,35 @@ class ArchiveBoxGroup(click.Group):
|
||||
'archive': 'add',
|
||||
# Old commands replaced by new model commands
|
||||
'orchestrator': 'run',
|
||||
'extract': 'archiveresult',
|
||||
}
|
||||
legacy_model_subcommands = {
|
||||
'crawl': {'create', 'list', 'update', 'delete'},
|
||||
'snapshot': {'create', 'list', 'update', 'delete'},
|
||||
}
|
||||
|
||||
@classmethod
def get_canonical_name(cls, cmd_name):
    """Resolve a possibly-renamed command name to its current name.

    ``cmd_name`` is looked up in ``cls.renamed_commands``; a name with no
    entry there is returned unchanged.
    """
    try:
        return cls.renamed_commands[cmd_name]
    except KeyError:
        return cmd_name
|
||||
|
||||
@classmethod
def _should_use_legacy_model_command(cls, cmd_name: str) -> bool:
    """Decide whether ``cmd_name`` should be routed to its legacy compat entrypoint.

    Returns True only when all of the following hold:

    * ``cmd_name`` has an entry in ``cls.legacy_model_commands``;
    * ``cmd_name`` appears as an element of ``sys.argv``;
    * at least one argument follows it, and that argument is neither
      ``-h``/``--help`` nor one of the subcommands listed for ``cmd_name``
      in ``cls.legacy_model_subcommands``.
    """
    if cmd_name not in cls.legacy_model_commands or cmd_name not in sys.argv:
        return False

    # Only the arguments after the first occurrence of the command matter.
    trailing_args = sys.argv[sys.argv.index(cmd_name) + 1:]
    if not trailing_args:
        return False

    next_arg = trailing_args[0]
    if next_arg in ('-h', '--help'):
        # Help requests are never sent to the legacy entrypoint.
        return False

    # Anything that is not a known subcommand is treated as legacy-style input.
    return next_arg not in cls.legacy_model_subcommands[cmd_name]
|
||||
|
||||
|
||||
def get_command(self, ctx, cmd_name):
|
||||
@@ -82,6 +110,9 @@ class ArchiveBoxGroup(click.Group):
|
||||
print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
|
||||
cmd_name = new_name
|
||||
ctx.invoked_subcommand = cmd_name
|
||||
|
||||
if self._should_use_legacy_model_command(cmd_name):
|
||||
return self._lazy_load(self.legacy_model_commands[cmd_name])
|
||||
|
||||
# handle lazy loading of commands
|
||||
if cmd_name in self.all_subcommands:
|
||||
@@ -91,8 +122,8 @@ class ArchiveBoxGroup(click.Group):
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
@classmethod
|
||||
def _lazy_load(cls, cmd_name):
|
||||
import_path = cls.all_subcommands[cmd_name]
|
||||
def _lazy_load(cls, cmd_name_or_path):
|
||||
import_path = cls.all_subcommands.get(cmd_name_or_path, cmd_name_or_path)
|
||||
modname, funcname = import_path.rsplit('.', 1)
|
||||
|
||||
# print(f'LAZY LOADING {import_path}')
|
||||
|
||||
27
archivebox/cli/archivebox_crawl_compat.py
Normal file
27
archivebox/cli/archivebox_crawl_compat.py
Normal file
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
@click.argument('urls', nargs=-1)
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
    """Backwards-compatible `archivebox crawl URL...` entrypoint."""
    # NOTE(review): the docstring above is kept verbatim because click presumably
    # surfaces it as the command's help text.

    # --status and --wait are parsed so old invocations do not fail, but neither
    # value is forwarded to add().
    del wait, status

    # Hand the URLs to add() with index_only=True and bg=True, then always exit 0.
    url_list = list(urls)
    add(url_list, depth=depth, tag=tag, index_only=True, bg=True)
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
@@ -226,7 +226,7 @@ def is_archiveresult_id(value: str) -> bool:
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
@@ -12,6 +12,7 @@ import rich_click as click
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
@@ -65,6 +66,9 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
|
||||
legacy_path = CONSTANTS.ARCHIVE_DIR / snapshot.timestamp
|
||||
if legacy_path.is_symlink():
|
||||
legacy_path.unlink(missing_ok=True)
|
||||
finally:
|
||||
timer.end()
|
||||
|
||||
|
||||
24
archivebox/cli/archivebox_snapshot_compat.py
Normal file
24
archivebox/cli/archivebox_snapshot_compat.py
Normal file
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.cli.archivebox_snapshot import create_snapshots
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@click.argument('urls', nargs=-1)
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
    """Backwards-compatible `archivebox snapshot URL...` entrypoint."""
    # NOTE(review): the docstring above is kept verbatim because click presumably
    # surfaces it as the command's help text.

    # Forward the parsed options unchanged and use create_snapshots()'s return
    # value as the process exit status.
    exit_code = create_snapshots(urls, tag=tag, status=status, depth=depth)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()
|
||||
@@ -1557,6 +1557,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'url': self.url,
|
||||
'title': self.title,
|
||||
'tags': self.tags_str(),
|
||||
'tags_str': self.tags_str(),
|
||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
'timestamp': self.timestamp,
|
||||
@@ -2306,7 +2307,7 @@ class SnapshotMachine(BaseStateMachine):
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
|
||||
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
|
||||
# Seal the parent crawl
|
||||
crawl.sm.seal()
|
||||
|
||||
@@ -332,8 +332,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
"""
|
||||
import time
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
|
||||
from archivebox.hooks import run_hook, discover_hooks, process_hook_records, is_finite_background_hook
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
# Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
|
||||
debug_log = Path('/tmp/archivebox_crawl_debug.log')
|
||||
@@ -344,6 +345,43 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# Get merged config with crawl context
|
||||
config = get_config(crawl=self)
|
||||
|
||||
machine = Machine.current()
|
||||
declared_binary_names: set[str] = set()
|
||||
|
||||
def install_declared_binaries(binary_names: set[str]) -> None:
    """Tick each not-yet-installed Binary named in ``binary_names`` once, then verify.

    Does nothing when ``binary_names`` is empty. Otherwise every Binary on
    ``machine`` whose name is in ``binary_names`` and whose status is not
    INSTALLED gets one ``sm.tick()`` call (ordered by ``retry_at``).

    Raises:
        RuntimeError: if, after the tick pass, any of the named binaries still
            has a status other than INSTALLED. The message lists each one with
            its current status.
    """
    if not binary_names:
        return

    def not_installed():
        # Build a new queryset on every call so the check after ticking is a
        # separate query from the one that drove the tick loop.
        return Binary.objects.filter(
            machine=machine,
            name__in=binary_names,
        ).exclude(
            status=Binary.StatusChoices.INSTALLED,
        )

    for pending in not_installed().order_by('retry_at'):
        try:
            pending.sm.tick()
        except Exception:
            # Best-effort: a tick that raises is ignored here; the binary is
            # still reported below if it did not reach INSTALLED.
            continue

    leftover = list(not_installed().order_by('name'))
    if not leftover:
        return

    binary_details = ', '.join(
        f'{binary.name} (status={binary.status})' for binary in leftover
    )
    raise RuntimeError(
        f'Crawl dependencies failed to install before continuing: {binary_details}'
    )
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Discovering Crawl hooks...\n')
|
||||
@@ -378,9 +416,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if hook_elapsed > 0.5: # Log slow hooks
|
||||
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
|
||||
|
||||
# Background hook - still running
|
||||
# Finite background hooks must finish before snapshots start so they can
|
||||
# emit dependency records (Binary, Machine config, etc.).
|
||||
if process.status == process.StatusChoices.RUNNING:
|
||||
continue
|
||||
if not is_finite_background_hook(hook.name):
|
||||
continue
|
||||
try:
|
||||
process.wait(timeout=process.timeout)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Foreground hook - process JSONL records
|
||||
from archivebox.hooks import extract_records_from_process
|
||||
@@ -394,33 +438,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if stats:
|
||||
print(f'[green]✓ Created: {stats}[/green]')
|
||||
|
||||
# Ensure any newly declared binaries are installed before creating snapshots
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
from django.utils import timezone
|
||||
hook_binary_names = {
|
||||
str(record.get('name')).strip()
|
||||
for record in records
|
||||
if record.get('type') == 'Binary' and record.get('name')
|
||||
}
|
||||
hook_binary_names.discard('')
|
||||
if hook_binary_names:
|
||||
declared_binary_names.update(hook_binary_names)
|
||||
install_declared_binaries(hook_binary_names)
|
||||
|
||||
machine = Machine.current()
|
||||
while True:
|
||||
pending_binaries = Binary.objects.filter(
|
||||
machine=machine,
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
retry_at__lte=timezone.now(),
|
||||
).order_by('retry_at')
|
||||
if not pending_binaries.exists():
|
||||
break
|
||||
|
||||
for binary in pending_binaries:
|
||||
try:
|
||||
binary.sm.tick()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Exit if nothing else is immediately retryable
|
||||
if not Binary.objects.filter(
|
||||
machine=machine,
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
retry_at__lte=timezone.now(),
|
||||
).exists():
|
||||
break
|
||||
# Safety check: don't create snapshots if any crawl-declared dependency
|
||||
# is still unresolved after all crawl hooks have run.
|
||||
install_declared_binaries(declared_binary_names)
|
||||
|
||||
# Create snapshots from all URLs in self.urls
|
||||
with open(debug_log, 'a') as f:
|
||||
|
||||
@@ -121,6 +121,11 @@ def is_background_hook(hook_name: str) -> bool:
|
||||
return '.bg.' in hook_name or '__background' in hook_name
|
||||
|
||||
|
||||
def is_finite_background_hook(hook_name: str) -> bool:
    """Tell whether a hook name marks a finite-lived background hook.

    A hook counts as finite when its name contains the ``.finite.bg.``
    marker; such hooks are the ones callers should wait on.
    """
    finite_marker = '.finite.bg.'
    return finite_marker in hook_name
|
||||
|
||||
|
||||
def iter_plugin_dirs() -> List[Path]:
|
||||
"""Iterate over all built-in and user plugin directories."""
|
||||
plugin_dirs: List[Path] = []
|
||||
@@ -904,8 +909,25 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
|
||||
plugins_whitelist = config.get('PLUGINS', '')
|
||||
if plugins_whitelist:
|
||||
# PLUGINS whitelist is specified - only enable plugins in the list
|
||||
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
|
||||
# PLUGINS whitelist is specified - include transitive required_plugins from config.json
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
||||
pending = list(plugin_names)
|
||||
|
||||
while pending:
|
||||
current = pending.pop()
|
||||
schema = plugin_configs.get(current, {})
|
||||
required_plugins = schema.get('required_plugins', [])
|
||||
if not isinstance(required_plugins, list):
|
||||
continue
|
||||
|
||||
for required_plugin in required_plugins:
|
||||
required_plugin_name = str(required_plugin).strip().lower()
|
||||
if not required_plugin_name or required_plugin_name in plugin_names:
|
||||
continue
|
||||
plugin_names.add(required_plugin_name)
|
||||
pending.append(required_plugin_name)
|
||||
|
||||
if plugin_name.lower() not in plugin_names:
|
||||
# Plugin not in whitelist - explicitly disabled
|
||||
enabled = False
|
||||
|
||||
@@ -1,8 +1,18 @@
|
||||
# Generated by Django 6.0 on 2026-01-05 01:09
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop the ``output_dir`` column from ``machine_binary`` only if it exists.

    Args:
        apps: historical app registry passed in by ``RunPython``.
        schema_editor: schema editor for the database being migrated.

    The column list is read with SQLite's ``PRAGMA table_info``. When
    ``output_dir`` is not among the columns the function returns without
    touching the schema.
    """
    # Use the cursor as a context manager so it is closed even if the PRAGMA
    # query fails; the previous version never closed it.
    with schema_editor.connection.cursor() as cursor:
        cursor.execute("PRAGMA table_info(machine_binary)")
        # Each table_info row is (cid, name, type, notnull, dflt_value, pk).
        columns = {row[1] for row in cursor.fetchall()}

    if 'output_dir' not in columns:
        return

    Binary = apps.get_model('machine', 'Binary')
    schema_editor.remove_field(Binary, Binary._meta.get_field('output_dir'))
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
@@ -10,8 +20,15 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='binary',
|
||||
name='output_dir',
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunPython(remove_output_dir_if_exists, migrations.RunPython.noop),
|
||||
],
|
||||
state_operations=[
|
||||
migrations.RemoveField(
|
||||
model_name='binary',
|
||||
name='output_dir',
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -352,7 +352,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
|
||||
# Case 2: From binaries.json - create queued binary (needs installation)
|
||||
if 'binproviders' in record or ('overrides' in record and not abspath):
|
||||
binary, created = Binary.objects.get_or_create(
|
||||
binary, _ = Binary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
|
||||
@@ -13,16 +13,16 @@ def process(tmp_path):
|
||||
def disable_extractors_dict():
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
|
||||
@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
|
||||
assert snapshot is not None, "Should create at least one snapshot"
|
||||
|
||||
|
||||
def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl creates a Seed object for input."""
|
||||
def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl input URLs are stored on the Crawl record."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
c = conn.cursor()
|
||||
seed = c.execute("SELECT id FROM crawls_seed").fetchone()
|
||||
crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
|
||||
assert seed is not None, "Seed should be created for crawl input"
|
||||
assert crawl_urls is not None, "Crawl should be created for crawl input"
|
||||
assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
|
||||
|
||||
|
||||
class TestCrawlCLI:
|
||||
@@ -178,7 +179,7 @@ class TestCrawlCLI:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--depth' in result.stdout or '-d' in result.stdout
|
||||
assert 'create' in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -3,7 +3,7 @@ import json as pyjson
|
||||
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
|
||||
disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
|
||||
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
|
||||
assert output_file.exists()
|
||||
|
||||
def test_readability_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_READABILITY": "true"})
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
|
||||
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
|
||||
assert output_file.exists()
|
||||
|
||||
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
output_str = add_process.stdout.decode("utf-8")
|
||||
|
||||
@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
env = os.environ.copy()
|
||||
env["TMP_DIR"] = str(tmp_short)
|
||||
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
|
||||
env["SAVE_TITLE"] = "True"
|
||||
env["SAVE_WGET"] = "True"
|
||||
env["SAVE_SINGLEFILE"] = "True"
|
||||
env["SAVE_READABILITY"] = "False"
|
||||
env["SAVE_HTMLTOTEXT"] = "True"
|
||||
env["SAVE_HEADERS"] = "True"
|
||||
env["SAVE_PDF"] = "False"
|
||||
env["SAVE_SCREENSHOT"] = "False"
|
||||
env["SAVE_ARCHIVEDOTORG"] = "False"
|
||||
env["SAVE_YTDLP"] = "False"
|
||||
env["SAVE_GIT"] = "False"
|
||||
|
||||
init = subprocess.run(
|
||||
["archivebox", "init"],
|
||||
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
assert init.returncode == 0, f"archivebox init failed: {init.stderr}"
|
||||
|
||||
result = subprocess.run(
|
||||
["archivebox", "add", "https://example.com"],
|
||||
["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900,
|
||||
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
)
|
||||
|
||||
text_hits = 0
|
||||
for path in (
|
||||
*snapshot_dir.glob("*_readability/content.txt"),
|
||||
snapshot_dir / "readability" / "content.txt",
|
||||
):
|
||||
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
|
||||
text_hits += 1
|
||||
for path in (
|
||||
*snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
|
||||
snapshot_dir / "htmltotext" / "htmltotext.txt",
|
||||
):
|
||||
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
|
||||
text_hits += 1
|
||||
assert text_hits >= 2, (
|
||||
"Expected multiple text extractors to contain Example Domain "
|
||||
f"(readability/htmltotext hits={text_hits})."
|
||||
assert text_hits >= 1, (
|
||||
"Expected htmltotext output to contain Example Domain "
|
||||
f"(htmltotext hits={text_hits})."
|
||||
)
|
||||
|
||||
@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
# Disable most extractors
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
|
||||
# Enable only parse_html_urls for this test
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"URL_ALLOWLIST": r"monadical\.com/.*", # Only crawl same domain
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
})
|
||||
|
||||
# Start a crawl with depth=1 (just one hop to test recursive crawling)
|
||||
# Use file:// URL so it's instant, no network fetch needed
|
||||
proc = subprocess.Popen(
|
||||
['archivebox', 'add', '--depth=1', f'file://{test_html}'],
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
# Verify snapshot exists
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_before >= 1
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert count == 0
|
||||
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_before >= 2
|
||||
|
||||
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_after == 0
|
||||
|
||||
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
|
||||
crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert crawl_count == 2
|
||||
|
||||
@@ -9,7 +9,10 @@ from pathlib import Path
|
||||
from archivebox.tests.conftest import create_test_url
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
|
||||
project_root = Path(__file__).resolve().parents[2]
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
|
||||
target_url = {request_url!r}
|
||||
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
|
||||
assert resp.status_code == 302, resp.status_code
|
||||
|
||||
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
|
||||
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
)
|
||||
assert resp['Location'] == f"/{{snapshot.url_path}}"
|
||||
|
||||
resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
|
||||
assert resp2.status_code == 302, resp2.status_code
|
||||
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
|
||||
assert resp2['Location'] == f"/{{snapshot.url_path}}"
|
||||
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
|
||||
"""/web/https://... should work for authenticated users even when public add is off."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-auth')
|
||||
request_url = url.replace('https://', '')
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (logged-in) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
|
||||
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-public')
|
||||
request_url = url.replace('https://', '')
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (public add) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
|
||||
@@ -6,14 +6,19 @@ from .fixtures import *
|
||||
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that title is extracted from the page."""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
|
||||
|
||||
os.chdir(tmp_path)
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
conn.row_factory = sqlite3.Row
|
||||
c = conn.cursor()
|
||||
c.execute("SELECT title from archivebox.core.snapshot")
|
||||
c.execute("SELECT title FROM core_snapshot")
|
||||
snapshot = c.fetchone()
|
||||
conn.close()
|
||||
|
||||
@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
|
||||
and breaks the layout.
|
||||
"""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
|
||||
list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
|
||||
|
||||
# Should not contain unescaped HTML tags in output
|
||||
|
||||
@@ -1,5 +1,30 @@
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from threading import Thread
|
||||
|
||||
from archivebox.misc.util import download_url
|
||||
|
||||
|
||||
class _ExampleHandler(BaseHTTPRequestHandler):
    """Minimal request handler that answers every GET with one fixed HTML page."""

    def do_GET(self):
        """Send a 200 response whose body contains an "Example Domain" heading."""
        payload = b"<html><body><h1>Example Domain</h1></body></html>"
        response_headers = (
            ("Content-Type", "text/html; charset=utf-8"),
            ("Content-Length", str(len(payload))),
        )

        self.send_response(200)
        for header_name, header_value in response_headers:
            self.send_header(header_name, header_value)
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, format, *args):
        """Override the base logger with a no-op so nothing is written."""
        return None
|
||||
|
||||
def test_download_url_downloads_content():
    """download_url() should return the page body served by a local HTTP server."""
    # The earlier download of https://example.com was removed: its result was
    # overwritten below, and it made the test depend on external network access.

    # Port 0 lets the OS choose a free port; the actual port is read back from
    # server.server_address when building the URL.
    server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
    thread = Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
    finally:
        # Always stop the server and release the socket, even if the download fails.
        server.shutdown()
        server.server_close()
        thread.join(timeout=5)

    assert "Example Domain" in text
|
||||
|
||||
Reference in New Issue
Block a user