mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox"
|
||||
import os
|
||||
import sys
|
||||
from importlib import import_module
|
||||
@@ -10,55 +10,55 @@ from rich import print
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
|
||||
|
||||
if '--debug' in sys.argv:
|
||||
os.environ['DEBUG'] = 'True'
|
||||
sys.argv.remove('--debug')
|
||||
if "--debug" in sys.argv:
|
||||
os.environ["DEBUG"] = "True"
|
||||
sys.argv.remove("--debug")
|
||||
|
||||
|
||||
class ArchiveBoxGroup(click.Group):
|
||||
"""lazy loading click group for archivebox commands"""
|
||||
|
||||
meta_commands = {
|
||||
'help': 'archivebox.cli.archivebox_help.main',
|
||||
'version': 'archivebox.cli.archivebox_version.main',
|
||||
'mcp': 'archivebox.cli.archivebox_mcp.main',
|
||||
"help": "archivebox.cli.archivebox_help.main",
|
||||
"version": "archivebox.cli.archivebox_version.main",
|
||||
"mcp": "archivebox.cli.archivebox_mcp.main",
|
||||
}
|
||||
setup_commands = {
|
||||
'init': 'archivebox.cli.archivebox_init.main',
|
||||
'install': 'archivebox.cli.archivebox_install.main',
|
||||
"init": "archivebox.cli.archivebox_init.main",
|
||||
"install": "archivebox.cli.archivebox_install.main",
|
||||
}
|
||||
# Model commands (CRUD operations via subcommands)
|
||||
model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
|
||||
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
|
||||
'tag': 'archivebox.cli.archivebox_tag.main',
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
'persona': 'archivebox.cli.archivebox_persona.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot.main",
|
||||
"archiveresult": "archivebox.cli.archivebox_archiveresult.main",
|
||||
"tag": "archivebox.cli.archivebox_tag.main",
|
||||
"binary": "archivebox.cli.archivebox_binary.main",
|
||||
"process": "archivebox.cli.archivebox_process.main",
|
||||
"machine": "archivebox.cli.archivebox_machine.main",
|
||||
"persona": "archivebox.cli.archivebox_persona.main",
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'extract': 'archivebox.cli.archivebox_extract.main',
|
||||
'list': 'archivebox.cli.archivebox_list.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
'update': 'archivebox.cli.archivebox_update.main',
|
||||
'status': 'archivebox.cli.archivebox_status.main',
|
||||
'search': 'archivebox.cli.archivebox_search.main',
|
||||
'config': 'archivebox.cli.archivebox_config.main',
|
||||
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
'shell': 'archivebox.cli.archivebox_shell.main',
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
"add": "archivebox.cli.archivebox_add.main",
|
||||
"extract": "archivebox.cli.archivebox_extract.main",
|
||||
"list": "archivebox.cli.archivebox_list.main",
|
||||
"remove": "archivebox.cli.archivebox_remove.main",
|
||||
"run": "archivebox.cli.archivebox_run.main",
|
||||
"update": "archivebox.cli.archivebox_update.main",
|
||||
"status": "archivebox.cli.archivebox_status.main",
|
||||
"search": "archivebox.cli.archivebox_search.main",
|
||||
"config": "archivebox.cli.archivebox_config.main",
|
||||
"schedule": "archivebox.cli.archivebox_schedule.main",
|
||||
"server": "archivebox.cli.archivebox_server.main",
|
||||
"shell": "archivebox.cli.archivebox_shell.main",
|
||||
"manage": "archivebox.cli.archivebox_manage.main",
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
"pluginmap": "archivebox.cli.archivebox_pluginmap.main",
|
||||
}
|
||||
legacy_model_commands = {
|
||||
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
|
||||
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
|
||||
"crawl": "archivebox.cli.archivebox_crawl_compat.main",
|
||||
"snapshot": "archivebox.cli.archivebox_snapshot_compat.main",
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
@@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group):
|
||||
**archive_commands,
|
||||
}
|
||||
renamed_commands = {
|
||||
'setup': 'install',
|
||||
'import': 'add',
|
||||
'archive': 'add',
|
||||
"setup": "install",
|
||||
"import": "add",
|
||||
"archive": "add",
|
||||
}
|
||||
legacy_model_subcommands = {
|
||||
'crawl': {'create', 'list', 'update', 'delete'},
|
||||
'snapshot': {'create', 'list', 'update', 'delete'},
|
||||
"crawl": {"create", "list", "update", "delete"},
|
||||
"snapshot": {"create", "list", "update", "delete"},
|
||||
}
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_canonical_name(cls, cmd_name):
|
||||
return cls.renamed_commands.get(cmd_name, cmd_name)
|
||||
@@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group):
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
remaining_args = sys.argv[arg_idx + 1:]
|
||||
remaining_args = sys.argv[arg_idx + 1 :]
|
||||
if not remaining_args:
|
||||
return False
|
||||
|
||||
first_arg = remaining_args[0]
|
||||
if first_arg in ('-h', '--help'):
|
||||
if first_arg in ("-h", "--help"):
|
||||
return False
|
||||
|
||||
return first_arg not in cls.legacy_model_subcommands[cmd_name]
|
||||
|
||||
|
||||
def get_command(self, ctx, cmd_name):
|
||||
# handle renamed commands
|
||||
if cmd_name in self.renamed_commands:
|
||||
new_name = self.renamed_commands[cmd_name]
|
||||
print(
|
||||
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
|
||||
f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`",
|
||||
file=sys.stderr,
|
||||
)
|
||||
cmd_name = new_name
|
||||
@@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group):
|
||||
|
||||
if self._should_use_legacy_model_command(cmd_name):
|
||||
return self._lazy_load(self.legacy_model_commands[cmd_name])
|
||||
|
||||
|
||||
# handle lazy loading of commands
|
||||
if cmd_name in self.all_subcommands:
|
||||
return self._lazy_load(cmd_name)
|
||||
|
||||
|
||||
# fall-back to using click's default command lookup
|
||||
return super().get_command(ctx, cmd_name)
|
||||
|
||||
@@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group):
|
||||
import_path = cls.all_subcommands.get(cmd_name_or_path)
|
||||
if import_path is None:
|
||||
import_path = cmd_name_or_path
|
||||
modname, funcname = import_path.rsplit('.', 1)
|
||||
|
||||
modname, funcname = import_path.rsplit(".", 1)
|
||||
|
||||
# print(f'LAZY LOADING {import_path}')
|
||||
mod = import_module(modname)
|
||||
func = getattr(mod, funcname)
|
||||
|
||||
if not hasattr(func, '__doc__'):
|
||||
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
|
||||
|
||||
|
||||
if not hasattr(func, "__doc__"):
|
||||
raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")
|
||||
|
||||
# if not isinstance(cmd, click.BaseCommand):
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
|
||||
|
||||
return func
|
||||
|
||||
|
||||
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
|
||||
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||
@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s')
|
||||
@click.option("--help", "-h", is_flag=True, help="Show help")
|
||||
@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s")
|
||||
@click.pass_context
|
||||
def cli(ctx, help=False):
|
||||
"""ArchiveBox: The self-hosted internet archive"""
|
||||
|
||||
|
||||
subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand)
|
||||
|
||||
|
||||
# if --help is passed or no subcommand is given, show custom help message
|
||||
if help or ctx.invoked_subcommand is None:
|
||||
ctx.invoke(ctx.command.get_command(ctx, 'help'))
|
||||
|
||||
ctx.invoke(ctx.command.get_command(ctx, "help"))
|
||||
|
||||
# if the subcommand is in archive_commands or model_commands,
|
||||
# then we need to set up the django environment and check that we're in a valid data folder
|
||||
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
|
||||
# print('SETUP DJANGO AND CHECK DATA FOLDER')
|
||||
try:
|
||||
if subcommand == 'server':
|
||||
run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
if subcommand == "server":
|
||||
run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes")
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if '--reload' in sys.argv:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
os.environ["ARCHIVEBOX_RUNSERVER"] = "1"
|
||||
if "--reload" in sys.argv:
|
||||
os.environ["ARCHIVEBOX_AUTORELOAD"] = "1"
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
|
||||
os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
except Exception as e:
|
||||
print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr)
|
||||
if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand
|
||||
print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr)
|
||||
if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand
|
||||
raise
|
||||
|
||||
|
||||
|
||||
def main(args=None, prog_name=None, stdin=None):
|
||||
# show `docker run archivebox xyz` in help messages if running in docker
|
||||
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
|
||||
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
|
||||
IS_TTY = sys.stdin.isatty()
|
||||
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
|
||||
|
||||
prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox")
|
||||
|
||||
# stdin param allows passing input data from caller (used by __main__.py)
|
||||
# currently not used by click-based CLI, but kept for backwards compatibility
|
||||
|
||||
try:
|
||||
cli(args=args, prog_name=prog_name)
|
||||
except KeyboardInterrupt:
|
||||
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
|
||||
print("\n\n[red][X] Got CTRL+C. Exiting...[/red]")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox add"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -14,6 +14,7 @@ from django.utils import timezone
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.misc.util import parse_filesize_to_bytes
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.permissions import USER, HOSTNAME
|
||||
@@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
|
||||
|
||||
urls: list[str] = []
|
||||
for record in read_args_or_stdin(args):
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if isinstance(url, str) and url:
|
||||
urls.append(url)
|
||||
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if isinstance(urls_field, str):
|
||||
for line in urls_field.splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
urls.append(line)
|
||||
|
||||
return urls
|
||||
|
||||
|
||||
@enforce_types
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
tag: str='',
|
||||
url_allowlist: str='',
|
||||
url_denylist: str='',
|
||||
parser: str="auto",
|
||||
plugins: str="",
|
||||
persona: str='Default',
|
||||
overwrite: bool=False,
|
||||
update: bool | None=None,
|
||||
index_only: bool=False,
|
||||
bg: bool=False,
|
||||
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
|
||||
def add(
|
||||
urls: str | list[str],
|
||||
depth: int | str = 0,
|
||||
max_urls: int = 0,
|
||||
max_size: int | str = 0,
|
||||
tag: str = "",
|
||||
url_allowlist: str = "",
|
||||
url_denylist: str = "",
|
||||
parser: str = "auto",
|
||||
plugins: str = "",
|
||||
persona: str = "Default",
|
||||
overwrite: bool = False,
|
||||
update: bool | None = None,
|
||||
index_only: bool = False,
|
||||
bg: bool = False,
|
||||
created_by_id: int | None = None,
|
||||
) -> tuple["Crawl", QuerySet["Snapshot"]]:
|
||||
"""Add a new URL or list of URLs to your archive.
|
||||
|
||||
The flow is:
|
||||
@@ -72,8 +77,15 @@ def add(urls: str | list[str],
|
||||
from rich import print
|
||||
|
||||
depth = int(depth)
|
||||
max_urls = int(max_urls or 0)
|
||||
max_size = parse_filesize_to_bytes(max_size)
|
||||
|
||||
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
|
||||
if depth not in (0, 1, 2, 3, 4):
|
||||
raise ValueError("Depth must be 0-4")
|
||||
if max_urls < 0:
|
||||
raise ValueError("max_urls must be >= 0")
|
||||
if max_size < 0:
|
||||
raise ValueError("max_size must be >= 0")
|
||||
|
||||
# import models once django is set up
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -91,47 +103,49 @@ def add(urls: str | list[str],
|
||||
update = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
|
||||
|
||||
# 2. Create a new Crawl with inline URLs
|
||||
cli_args = [*sys.argv]
|
||||
if cli_args[0].lower().endswith('archivebox'):
|
||||
cli_args[0] = 'archivebox'
|
||||
cmd_str = ' '.join(cli_args)
|
||||
if cli_args[0].lower().endswith("archivebox"):
|
||||
cli_args[0] = "archivebox"
|
||||
cmd_str = " ".join(cli_args)
|
||||
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
|
||||
# Read URLs directly into crawl
|
||||
urls_content = sources_file.read_text()
|
||||
persona_name = (persona or 'Default').strip() or 'Default'
|
||||
plugins = plugins or str(get_config().get('PLUGINS') or '')
|
||||
persona_name = (persona or "Default").strip() or "Default"
|
||||
plugins = plugins or str(get_config().get("PLUGINS") or "")
|
||||
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
|
||||
persona_obj.ensure_dirs()
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
max_urls=max_urls,
|
||||
max_size=max_size,
|
||||
tags_str=tag,
|
||||
persona_id=persona_obj.id,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
|
||||
created_by_id=created_by_id,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona_name,
|
||||
'PARSER': parser,
|
||||
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
|
||||
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
|
||||
}
|
||||
"ONLY_NEW": not update,
|
||||
"INDEX_ONLY": index_only,
|
||||
"OVERWRITE": overwrite,
|
||||
"PLUGINS": plugins,
|
||||
"DEFAULT_PERSONA": persona_name,
|
||||
"PARSER": parser,
|
||||
**({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
|
||||
**({"URL_DENYLIST": url_denylist} if url_denylist else {}),
|
||||
},
|
||||
)
|
||||
|
||||
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
|
||||
print(f' [dim]First URL: {first_url}[/dim]')
|
||||
print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
|
||||
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ""
|
||||
print(f" [dim]First URL: {first_url}[/dim]")
|
||||
|
||||
# 3. The CrawlMachine will create Snapshots from all URLs when started
|
||||
# Parser extractors run on snapshots and discover more URLs
|
||||
@@ -139,20 +153,21 @@ def add(urls: str | list[str],
|
||||
|
||||
if index_only:
|
||||
# Just create the crawl but don't start processing
|
||||
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
|
||||
print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
|
||||
# Create snapshots for all URLs in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
snapshot, _ = Snapshot.objects.update_or_create(
|
||||
crawl=crawl, url=url,
|
||||
crawl=crawl,
|
||||
url=url,
|
||||
defaults={
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'depth': 0,
|
||||
"status": Snapshot.INITIAL_STATE,
|
||||
"retry_at": timezone.now(),
|
||||
"timestamp": str(timezone.now().timestamp()),
|
||||
"depth": 0,
|
||||
},
|
||||
)
|
||||
if tag:
|
||||
snapshot.save_tags(tag.split(','))
|
||||
snapshot.save_tags(tag.split(","))
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return crawl, crawl.snapshot_set.all()
|
||||
|
||||
@@ -168,10 +183,12 @@ def add(urls: str | list[str],
|
||||
|
||||
if bg:
|
||||
# Background mode: just queue work and return (background runner via server will pick it up)
|
||||
print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
|
||||
)
|
||||
else:
|
||||
# Foreground mode: run full crawl runner until all work is done
|
||||
print('[green]\\[*] Starting crawl runner to process crawl...[/green]')
|
||||
print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
|
||||
run_crawl(str(crawl.id))
|
||||
|
||||
# Print summary for foreground runs
|
||||
@@ -179,7 +196,10 @@ def add(urls: str | list[str],
|
||||
crawl.refresh_from_db()
|
||||
snapshots_count = crawl.snapshot_set.count()
|
||||
try:
|
||||
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
|
||||
from django.db.models import Count, Sum
|
||||
|
||||
totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
|
||||
total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
|
||||
except Exception:
|
||||
total_bytes, _, _ = get_dir_size(crawl.output_dir)
|
||||
total_size = printable_filesize(total_bytes)
|
||||
@@ -197,23 +217,23 @@ def add(urls: str | list[str],
|
||||
# Output dir relative to DATA_DIR
|
||||
try:
|
||||
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
|
||||
rel_output_str = f'./{rel_output}'
|
||||
rel_output_str = f"./{rel_output}"
|
||||
except Exception:
|
||||
rel_output_str = str(crawl.output_dir)
|
||||
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
|
||||
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
|
||||
bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
|
||||
if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
|
||||
base_url = bind_addr
|
||||
else:
|
||||
base_url = f'http://{bind_addr}'
|
||||
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
|
||||
base_url = f"http://{bind_addr}"
|
||||
admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"
|
||||
|
||||
print('\n[bold]crawl output saved to:[/bold]')
|
||||
print(f' {rel_output_str}')
|
||||
print(f' {admin_url}')
|
||||
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
|
||||
print(f'[bold]total size:[/bold] {total_size}')
|
||||
print(f'[bold]total time:[/bold] {duration_str}')
|
||||
print("\n[bold]crawl output saved to:[/bold]")
|
||||
print(f" {rel_output_str}")
|
||||
print(f" {admin_url}")
|
||||
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
|
||||
print(f"[bold]total size:[/bold] {total_size}")
|
||||
print(f"[bold]total time:[/bold] {duration_str}")
|
||||
except Exception:
|
||||
# Summary is best-effort; avoid failing the command if something goes wrong
|
||||
pass
|
||||
@@ -224,29 +244,43 @@ def add(urls: str | list[str],
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
|
||||
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
|
||||
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
|
||||
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
|
||||
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
|
||||
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
|
||||
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
|
||||
@click.argument('urls', nargs=-1, type=click.Path())
|
||||
@click.option(
|
||||
"--depth",
|
||||
"-d",
|
||||
type=click.Choice([str(i) for i in range(5)]),
|
||||
default="0",
|
||||
help="Recursively archive linked pages up to N hops away",
|
||||
)
|
||||
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
|
||||
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
|
||||
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
|
||||
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
|
||||
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
|
||||
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
|
||||
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
|
||||
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
|
||||
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
|
||||
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
|
||||
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
|
||||
@click.argument("urls", nargs=-1, type=click.Path())
|
||||
@docstring(add.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
raw_urls = kwargs.pop('urls')
|
||||
raw_urls = kwargs.pop("urls")
|
||||
urls = _collect_input_urls(raw_urls)
|
||||
if not urls:
|
||||
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
|
||||
raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
|
||||
if int(kwargs.get("max_urls") or 0) < 0:
|
||||
raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
|
||||
try:
|
||||
kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
|
||||
except ValueError as err:
|
||||
raise click.BadParameter(str(err), param_hint="--max-size") from err
|
||||
|
||||
add(urls=urls, **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,10 @@ Examples:
|
||||
archivebox archiveresult list --status=failed | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox archiveresult'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox archiveresult"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -42,13 +41,13 @@ from rich import print as rprint
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
|
||||
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
|
||||
return {
|
||||
'type': 'ArchiveResult',
|
||||
'snapshot_id': str(snapshot_id),
|
||||
'plugin': plugin,
|
||||
'hook_name': hook_name,
|
||||
'status': status,
|
||||
"type": "ArchiveResult",
|
||||
"snapshot_id": str(snapshot_id),
|
||||
"plugin": plugin,
|
||||
"hook_name": hook_name,
|
||||
"status": status,
|
||||
}
|
||||
|
||||
|
||||
@@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str =
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_archiveresults(
|
||||
snapshot_id: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
status: str = 'queued',
|
||||
snapshot_id: str | None = None,
|
||||
plugin: str | None = None,
|
||||
status: str = "queued",
|
||||
) -> int:
|
||||
"""
|
||||
Create ArchiveResult request records for Snapshots.
|
||||
@@ -86,13 +86,13 @@ def create_archiveresults(
|
||||
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
||||
pass_through_records = []
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Read from stdin
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate snapshot records from pass-through records
|
||||
@@ -100,17 +100,17 @@ def create_archiveresults(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
# Pass through the Snapshot record itself
|
||||
pass_through_records.append(record)
|
||||
if record.get('id'):
|
||||
snapshot_ids.append(record['id'])
|
||||
if record.get("id"):
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
# ArchiveResult records: pass through if they have an id
|
||||
if record.get('id'):
|
||||
if record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
# If no id, we could create it, but for now just pass through
|
||||
else:
|
||||
@@ -120,9 +120,9 @@ def create_archiveresults(
|
||||
# Other typed records (Crawl, Tag, etc): pass through
|
||||
pass_through_records.append(record)
|
||||
|
||||
elif record.get('id'):
|
||||
elif record.get("id"):
|
||||
# Untyped record with id - assume it's a snapshot ID
|
||||
snapshot_ids.append(record['id'])
|
||||
snapshot_ids.append(record["id"])
|
||||
|
||||
# Output pass-through records first
|
||||
if not is_tty:
|
||||
@@ -131,15 +131,15 @@ def create_archiveresults(
|
||||
|
||||
if not snapshot_ids:
|
||||
if pass_through_records:
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
|
||||
|
||||
if not snapshots:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0 if pass_through_records else 1
|
||||
|
||||
created_count = 0
|
||||
@@ -150,7 +150,7 @@ def create_archiveresults(
|
||||
created_count += 1
|
||||
else:
|
||||
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
hooks = discover_hooks("Snapshot", config=config)
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name
|
||||
plugin_name = hook_path.parent.name
|
||||
@@ -158,7 +158,7 @@ def create_archiveresults(
|
||||
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
|
||||
created_count += 1
|
||||
|
||||
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -166,11 +166,12 @@ def create_archiveresults(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
snapshot_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
plugin: str | None = None,
|
||||
snapshot_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List ArchiveResults as JSONL with optional filters.
|
||||
@@ -183,13 +184,13 @@ def list_archiveresults(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = ArchiveResult.objects.all().order_by('-start_ts')
|
||||
queryset = ArchiveResult.objects.all().order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'plugin': plugin,
|
||||
'snapshot_id': snapshot_id,
|
||||
"status": status,
|
||||
"plugin": plugin,
|
||||
"snapshot_id": snapshot_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -197,20 +198,22 @@ def list_archiveresults(
|
||||
for result in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'dim',
|
||||
'noresults': 'dim',
|
||||
'backoff': 'magenta',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "dim",
|
||||
"noresults": "dim",
|
||||
"backoff": "magenta",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -218,8 +221,9 @@ def list_archiveresults(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_archiveresults(
|
||||
status: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update ArchiveResults from stdin JSONL.
|
||||
@@ -238,12 +242,12 @@ def update_archiveresults(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
result_id = record.get('id')
|
||||
result_id = record.get("id")
|
||||
if not result_id:
|
||||
continue
|
||||
|
||||
@@ -261,10 +265,10 @@ def update_archiveresults(
|
||||
write_record(result.to_json())
|
||||
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -272,6 +276,7 @@ def update_archiveresults(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete ArchiveResults from stdin JSONL.
|
||||
@@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
result_ids = [r.get('id') for r in records if r.get('id')]
|
||||
result_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not result_ids:
|
||||
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
results = ArchiveResult.objects.filter(id__in=result_ids)
|
||||
count = results.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
|
||||
for result in results[:10]:
|
||||
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
|
||||
rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
|
||||
if count > 10:
|
||||
rprint(f' ... and {count - 10} more', file=sys.stderr)
|
||||
rprint(f" ... and {count - 10} more", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = results.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage ArchiveResult records (plugin extraction results)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--snapshot-id', help='Snapshot ID to create results for')
|
||||
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
|
||||
@main.command("create")
|
||||
@click.option("--snapshot-id", help="Snapshot ID to create results for")
|
||||
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
|
||||
"""Create ArchiveResults for Snapshots from stdin JSONL."""
|
||||
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
|
||||
@click.option('--plugin', '-p', help='Filter by plugin name')
|
||||
@click.option('--snapshot-id', help='Filter by snapshot ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], plugin: Optional[str],
|
||||
snapshot_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
|
||||
@click.option("--plugin", "-p", help="Filter by plugin name")
|
||||
@click.option("--snapshot-id", help="Filter by snapshot ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
plugin: str | None,
|
||||
snapshot_id: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List ArchiveResults as JSONL."""
|
||||
sys.exit(list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_archiveresults(
|
||||
status=status,
|
||||
plugin=plugin,
|
||||
snapshot_id=snapshot_id,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
def update_cmd(status: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
def update_cmd(status: str | None):
|
||||
"""Update ArchiveResults from stdin JSONL."""
|
||||
sys.exit(update_archiveresults(status=status))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete ArchiveResults from stdin JSONL."""
|
||||
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -25,11 +25,10 @@ Examples:
|
||||
archivebox binary list --name=chrome | archivebox binary delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox binary'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox binary"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_binary(
|
||||
name: str,
|
||||
abspath: str,
|
||||
version: str = '',
|
||||
version: str = "",
|
||||
) -> int:
|
||||
"""
|
||||
Create/register a Binary.
|
||||
@@ -59,7 +59,7 @@ def create_binary(
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
if not name or not abspath:
|
||||
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
|
||||
rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
@@ -76,28 +76,30 @@ def create_binary(
|
||||
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
|
||||
# records are owned by the current machine and can be safely piped into
|
||||
# `archivebox run` without creating invalid rows missing machine_id.
|
||||
binary = Binary.from_json({
|
||||
'name': name,
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'binproviders': 'env',
|
||||
'binprovider': 'env',
|
||||
})
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": name,
|
||||
"abspath": abspath,
|
||||
"version": version,
|
||||
"binproviders": "env",
|
||||
"binprovider": "env",
|
||||
},
|
||||
)
|
||||
if binary is None:
|
||||
raise ValueError('failed to create binary record')
|
||||
raise ValueError("failed to create binary record")
|
||||
|
||||
if not is_tty:
|
||||
write_record(binary.to_json())
|
||||
|
||||
if created:
|
||||
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -105,11 +107,12 @@ def create_binary(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_binaries(
|
||||
name: Optional[str] = None,
|
||||
abspath__icontains: Optional[str] = None,
|
||||
version__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
abspath__icontains: str | None = None,
|
||||
version__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Binaries as JSONL with optional filters.
|
||||
@@ -122,25 +125,25 @@ def list_binaries(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at')
|
||||
queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'abspath__icontains': abspath__icontains,
|
||||
'version__icontains': version__icontains,
|
||||
"name": name,
|
||||
"abspath__icontains": abspath__icontains,
|
||||
"version__icontains": version__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for binary in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
|
||||
rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
|
||||
else:
|
||||
write_record(binary.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -148,9 +151,10 @@ def list_binaries(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_binaries(
|
||||
version: Optional[str] = None,
|
||||
abspath: Optional[str] = None,
|
||||
version: str | None = None,
|
||||
abspath: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Binaries from stdin JSONL.
|
||||
@@ -169,12 +173,12 @@ def update_binaries(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
binary_id = record.get('id')
|
||||
binary_id = record.get("id")
|
||||
if not binary_id:
|
||||
continue
|
||||
|
||||
@@ -194,10 +198,10 @@ def update_binaries(
|
||||
write_record(binary.to_json())
|
||||
|
||||
except Binary.DoesNotExist:
|
||||
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -205,6 +209,7 @@ def update_binaries(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Binaries from stdin JSONL.
|
||||
@@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binary_ids = [r.get('id') for r in records if r.get('id')]
|
||||
binary_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not binary_ids:
|
||||
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
binaries = Binary.objects.filter(id__in=binary_ids)
|
||||
count = binaries.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
|
||||
for binary in binaries:
|
||||
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
|
||||
rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = binaries.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Binary records (detected executables)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
|
||||
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
|
||||
@click.option('--version', '-v', default='', help='Binary version')
|
||||
@main.command("create")
|
||||
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
|
||||
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
|
||||
@click.option("--version", "-v", default="", help="Binary version")
|
||||
def create_cmd(name: str, abspath: str, version: str):
|
||||
"""Create/register a Binary."""
|
||||
sys.exit(create_binary(name=name, abspath=abspath, version=version))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', '-n', help='Filter by name')
|
||||
@click.option('--abspath__icontains', help='Filter by path contains')
|
||||
@click.option('--version__icontains', help='Filter by version contains')
|
||||
@click.option('--limit', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
|
||||
version__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", "-n", help="Filter by name")
|
||||
@click.option("--abspath__icontains", help="Filter by path contains")
|
||||
@click.option("--version__icontains", help="Filter by version contains")
|
||||
@click.option("--limit", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
name: str | None,
|
||||
abspath__icontains: str | None,
|
||||
version__icontains: str | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Binaries as JSONL."""
|
||||
sys.exit(list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_binaries(
|
||||
name=name,
|
||||
abspath__icontains=abspath__icontains,
|
||||
version__icontains=version__icontains,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--version', '-v', help='Set version')
|
||||
@click.option('--abspath', '-p', help='Set path')
|
||||
def update_cmd(version: Optional[str], abspath: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--version", "-v", help="Set version")
|
||||
@click.option("--abspath", "-p", help="Set path")
|
||||
def update_cmd(version: str | None, abspath: str | None):
|
||||
"""Update Binaries from stdin JSONL."""
|
||||
sys.exit(update_binaries(version=version, abspath=abspath))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Binaries from stdin JSONL."""
|
||||
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import rich_click as click
|
||||
@@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder
|
||||
|
||||
|
||||
@enforce_types
|
||||
def config(*keys,
|
||||
get: bool=False,
|
||||
set: bool=False,
|
||||
search: bool=False,
|
||||
reset: bool=False,
|
||||
**kwargs) -> None:
|
||||
def config(
|
||||
*keys,
|
||||
get: bool = False,
|
||||
set: bool = False,
|
||||
search: bool = False,
|
||||
reset: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""Get and set your ArchiveBox project configuration values"""
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
@@ -29,8 +31,8 @@ def config(*keys,
|
||||
|
||||
FLAT_CONFIG = get_flat_config()
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
|
||||
|
||||
config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
|
||||
no_args = not (get or set or reset or config_options)
|
||||
|
||||
matching_config = {}
|
||||
@@ -39,19 +41,19 @@ def config(*keys,
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
for config_section in CONFIGS.values():
|
||||
aliases = getattr(config_section, 'aliases', {})
|
||||
|
||||
aliases = getattr(config_section, "aliases", {})
|
||||
|
||||
for search_key in config_options:
|
||||
# search all aliases in the section
|
||||
for alias_key, key in aliases.items():
|
||||
if search_key.lower() in alias_key.lower():
|
||||
matching_config[key] = dict(config_section)[key]
|
||||
|
||||
|
||||
# search all keys and values in the section
|
||||
for existing_key, value in dict(config_section).items():
|
||||
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
|
||||
matching_config[existing_key] = value
|
||||
|
||||
|
||||
print(printable_config(matching_config))
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -61,23 +63,23 @@ def config(*keys,
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
|
||||
if failed_config:
|
||||
print('\n[red][X] These options failed to get[/red]')
|
||||
print(' {}'.format('\n '.join(config_options)))
|
||||
print("\n[red][X] These options failed to get[/red]")
|
||||
print(" {}".format("\n ".join(config_options)))
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = FLAT_CONFIG
|
||||
|
||||
# Display core config sections
|
||||
for config_section in CONFIGS.values():
|
||||
section_header = getattr(config_section, 'toml_section_header', '')
|
||||
section_header = getattr(config_section, "toml_section_header", "")
|
||||
if isinstance(section_header, str) and section_header:
|
||||
print(f'[grey53]\\[{section_header}][/grey53]')
|
||||
print(f"[grey53]\\[{section_header}][/grey53]")
|
||||
else:
|
||||
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
|
||||
print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")
|
||||
|
||||
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
# Display plugin config section
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
@@ -87,17 +89,17 @@ def config(*keys,
|
||||
|
||||
# Collect all plugin config keys
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' not in schema:
|
||||
if "properties" not in schema:
|
||||
continue
|
||||
for key in schema['properties'].keys():
|
||||
for key in schema["properties"].keys():
|
||||
if key in matching_config:
|
||||
plugin_keys[key] = matching_config[key]
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print('[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
print("[grey53]\\[PLUGINS][/grey53]")
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
|
||||
print("[grey53]################################################################[/grey53]")
|
||||
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
@@ -105,18 +107,20 @@ def config(*keys,
|
||||
new_config = {}
|
||||
failed_options = []
|
||||
for line in config_options:
|
||||
if line.startswith('#') or not line.strip():
|
||||
if line.startswith("#") or not line.strip():
|
||||
continue
|
||||
if '=' not in line:
|
||||
print('[red][X] Config KEY=VALUE must have an = sign in it[/red]')
|
||||
print(f' {line}')
|
||||
if "=" not in line:
|
||||
print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
|
||||
print(f" {line}")
|
||||
raise SystemExit(2)
|
||||
|
||||
raw_key, val = line.split('=', 1)
|
||||
raw_key, val = line.split("=", 1)
|
||||
raw_key = raw_key.upper().strip()
|
||||
key = get_real_name(raw_key)
|
||||
if key != raw_key:
|
||||
print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]')
|
||||
print(
|
||||
f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
|
||||
)
|
||||
|
||||
if key in FLAT_CONFIG:
|
||||
new_config[key] = val.strip()
|
||||
@@ -136,38 +140,38 @@ def config(*keys,
|
||||
|
||||
if side_effect_changes:
|
||||
print(file=sys.stderr)
|
||||
print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr)
|
||||
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr)
|
||||
print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
|
||||
print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)
|
||||
|
||||
if failed_options:
|
||||
print()
|
||||
print('[red][X] These options failed to set (check for typos):[/red]')
|
||||
print(' {}'.format('\n '.join(failed_options)))
|
||||
print("[red][X] These options failed to set (check for typos):[/red]")
|
||||
print(" {}".format("\n ".join(failed_options)))
|
||||
raise SystemExit(1)
|
||||
|
||||
elif reset:
|
||||
print('[red][X] This command is not implemented yet.[/red]')
|
||||
print(' Please manually remove the relevant lines from your config file:')
|
||||
print("[red][X] This command is not implemented yet.[/red]")
|
||||
print(" Please manually remove the relevant lines from your config file:")
|
||||
raise SystemExit(2)
|
||||
|
||||
else:
|
||||
print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]')
|
||||
print(' archivebox config')
|
||||
print(' archivebox config --get SOME_KEY')
|
||||
print(' archivebox config --set SOME_KEY=SOME_VALUE')
|
||||
print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
|
||||
print(" archivebox config")
|
||||
print(" archivebox config --get SOME_KEY")
|
||||
print(" archivebox config --set SOME_KEY=SOME_VALUE")
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term')
|
||||
@click.option('--get', is_flag=True, help='Get the value for the given config KEYs')
|
||||
@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values')
|
||||
@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults')
|
||||
@click.argument('KEY=VALUE', nargs=-1, type=str)
|
||||
@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term")
|
||||
@click.option("--get", is_flag=True, help="Get the value for the given config KEYs")
|
||||
@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values")
|
||||
@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults")
|
||||
@click.argument("KEY=VALUE", nargs=-1, type=str)
|
||||
@docstring(config.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
config(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -30,11 +30,11 @@ Examples:
|
||||
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_crawl(
|
||||
urls: Iterable[str],
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
created_by_id: Optional[int] = None,
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create a Crawl job from URLs.
|
||||
@@ -74,7 +75,7 @@ def create_crawl(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Separate pass-through records from URL records
|
||||
@@ -82,29 +83,29 @@ def create_crawl(
|
||||
pass_through_records = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
# Pass-through: output records that aren't URL/Crawl types
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
|
||||
if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Handle existing Crawl records (just pass through with id)
|
||||
if record_type == TYPE_CRAWL and record.get('id'):
|
||||
if record_type == TYPE_CRAWL and record.get("id"):
|
||||
pass_through_records.append(record)
|
||||
continue
|
||||
|
||||
# Collect URLs
|
||||
url = record.get('url')
|
||||
url = record.get("url")
|
||||
if url:
|
||||
url_list.append(url)
|
||||
|
||||
# Handle 'urls' field (newline-separated)
|
||||
urls_field = record.get('urls')
|
||||
urls_field = record.get("urls")
|
||||
if urls_field:
|
||||
for line in urls_field.split('\n'):
|
||||
for line in urls_field.split("\n"):
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
if line and not line.startswith("#"):
|
||||
url_list.append(line)
|
||||
|
||||
# Output pass-through records first
|
||||
@@ -115,44 +116,44 @@ def create_crawl(
|
||||
if not url_list:
|
||||
if pass_through_records:
|
||||
# If we had pass-through records but no URLs, that's OK
|
||||
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid URLs found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Build crawl record with all URLs as newline-separated string
|
||||
crawl_record = {
|
||||
'urls': '\n'.join(url_list),
|
||||
'max_depth': depth,
|
||||
'tags_str': tag,
|
||||
'status': status,
|
||||
'label': '',
|
||||
"urls": "\n".join(url_list),
|
||||
"max_depth": depth,
|
||||
"tags_str": tag,
|
||||
"status": status,
|
||||
"label": "",
|
||||
}
|
||||
|
||||
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
|
||||
if not crawl:
|
||||
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
|
||||
rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(crawl.to_json())
|
||||
|
||||
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)
|
||||
|
||||
# If TTY, show human-readable output
|
||||
if is_tty:
|
||||
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
|
||||
rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
|
||||
for url in url_list[:5]: # Show first 5 URLs
|
||||
rprint(f' {url[:70]}', file=sys.stderr)
|
||||
rprint(f" {url[:70]}", file=sys.stderr)
|
||||
if len(url_list) > 5:
|
||||
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
|
||||
rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
@@ -160,11 +161,12 @@ def create_crawl(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_crawls(
|
||||
status: Optional[str] = None,
|
||||
urls__icontains: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
limit: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
urls__icontains: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Crawls as JSONL with optional filters.
|
||||
@@ -177,13 +179,13 @@ def list_crawls(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Crawl.objects.all().order_by('-created_at')
|
||||
queryset = Crawl.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'urls__icontains': urls__icontains,
|
||||
'max_depth': max_depth,
|
||||
"status": status,
|
||||
"urls__icontains": urls__icontains,
|
||||
"max_depth": max_depth,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -191,17 +193,17 @@ def list_crawls(
|
||||
for crawl in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(crawl.status, 'dim')
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(crawl.status, "dim")
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
|
||||
else:
|
||||
write_record(crawl.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -209,9 +211,10 @@ def list_crawls(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_crawls(
|
||||
status: Optional[str] = None,
|
||||
max_depth: Optional[int] = None,
|
||||
status: str | None = None,
|
||||
max_depth: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Crawls from stdin JSONL.
|
||||
@@ -232,12 +235,12 @@ def update_crawls(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if not crawl_id:
|
||||
continue
|
||||
|
||||
@@ -258,10 +261,10 @@ def update_crawls(
|
||||
write_record(crawl.to_json())
|
||||
|
||||
except Crawl.DoesNotExist:
|
||||
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -269,6 +272,7 @@ def update_crawls(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Crawls from stdin JSONL.
|
||||
@@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawl_ids = [r.get('id') for r in records if r.get('id')]
|
||||
crawl_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not crawl_ids:
|
||||
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
crawls = Crawl.objects.filter(id__in=crawl_ids)
|
||||
count = crawls.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
|
||||
for crawl in crawls:
|
||||
url_preview = crawl.urls[:50].replace('\n', ' ')
|
||||
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
|
||||
url_preview = crawl.urls[:50].replace("\n", " ")
|
||||
rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = crawls.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Crawl records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
|
||||
"""Create a Crawl job from URLs or stdin."""
|
||||
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--urls__icontains', help='Filter by URLs contains')
|
||||
@click.option('--max-depth', type=int, help='Filter by max depth')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
|
||||
max_depth: Optional[int], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--urls__icontains", help="Filter by URLs contains")
|
||||
@click.option("--max-depth", type=int, help="Filter by max depth")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
urls__icontains: str | None,
|
||||
max_depth: int | None,
|
||||
limit: int | None,
|
||||
):
|
||||
"""List Crawls as JSONL."""
|
||||
sys.exit(list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_crawls(
|
||||
status=status,
|
||||
urls__icontains=urls__icontains,
|
||||
max_depth=max_depth,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--max-depth', type=int, help='Set max depth')
|
||||
def update_cmd(status: Optional[str], max_depth: Optional[int]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--max-depth", type=int, help="Set max depth")
|
||||
def update_cmd(status: str | None, max_depth: int | None):
|
||||
"""Update Crawls from stdin JSONL."""
|
||||
sys.exit(update_crawls(status=status, max_depth=max_depth))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Crawls from stdin JSONL."""
|
||||
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox crawl'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox crawl"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,12 +10,12 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox crawl URL...` entrypoint."""
|
||||
del status, wait
|
||||
@@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,8 +27,8 @@ Examples:
|
||||
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox extract'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox extract"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
|
||||
|
||||
try:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
snapshot.status = snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.status != crawl.StatusChoices.STARTED:
|
||||
crawl.status = crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
||||
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
|
||||
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
|
||||
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
def run_plugins(
|
||||
args: tuple,
|
||||
records: list[dict] | None = None,
|
||||
plugins: str = '',
|
||||
plugins: str = "",
|
||||
wait: bool = True,
|
||||
emit_results: bool = True,
|
||||
) -> int:
|
||||
"""
|
||||
Run plugins on Snapshots from input.
|
||||
@@ -111,16 +112,18 @@ def run_plugins(
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_ARCHIVERESULT,
|
||||
)
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Parse comma-separated plugins list once (reused in creation and filtering)
|
||||
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
|
||||
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
|
||||
|
||||
# Parse stdin/args exactly once per CLI invocation.
|
||||
# `main()` may already have consumed stdin to distinguish Snapshot input from
|
||||
@@ -130,41 +133,41 @@ def run_plugins(
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Gather snapshot IDs and optional plugin constraints to process
|
||||
snapshot_ids = set()
|
||||
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
||||
for record in records:
|
||||
record_type = record.get('type')
|
||||
record_type = record.get("type")
|
||||
|
||||
if record_type == TYPE_SNAPSHOT:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
elif record.get('url'):
|
||||
elif record.get("url"):
|
||||
# Look up by URL (get most recent if multiple exist)
|
||||
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
|
||||
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
|
||||
if snap:
|
||||
snapshot_ids.add(str(snap.id))
|
||||
else:
|
||||
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
|
||||
|
||||
elif record_type == TYPE_ARCHIVERESULT:
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
if snapshot_id:
|
||||
snapshot_ids.add(snapshot_id)
|
||||
plugin_name = record.get('plugin')
|
||||
plugin_name = record.get("plugin")
|
||||
if plugin_name and not plugins_list:
|
||||
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
||||
|
||||
elif 'id' in record:
|
||||
elif "id" in record:
|
||||
# Assume it's a snapshot ID
|
||||
snapshot_ids.add(record['id'])
|
||||
snapshot_ids.add(record["id"])
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
|
||||
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Get snapshots and ensure they have pending ArchiveResults
|
||||
@@ -173,17 +176,13 @@ def run_plugins(
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
|
||||
if existing_result and existing_result.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
|
||||
for plugin_name in requested_plugin_names:
|
||||
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
|
||||
if existing_result:
|
||||
existing_result.reset_for_retry()
|
||||
|
||||
# Reset snapshot status to allow processing
|
||||
@@ -195,34 +194,39 @@ def run_plugins(
|
||||
processed_count += 1
|
||||
|
||||
if processed_count == 0:
|
||||
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
|
||||
|
||||
# Run orchestrator if --wait (default)
|
||||
if wait:
|
||||
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
|
||||
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
|
||||
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
||||
|
||||
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
||||
selected_plugins = plugins_list or sorted({
|
||||
plugin
|
||||
for snapshot_id in crawl_snapshot_ids
|
||||
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
|
||||
}) or None
|
||||
selected_plugins = (
|
||||
plugins_list
|
||||
or sorted(
|
||||
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
|
||||
)
|
||||
or None
|
||||
)
|
||||
run_crawl(
|
||||
crawl_id,
|
||||
snapshot_ids=sorted(crawl_snapshot_ids),
|
||||
selected_plugins=selected_plugins,
|
||||
)
|
||||
|
||||
if not emit_results:
|
||||
return 0
|
||||
|
||||
# Output results as JSONL (when piped) or human-readable (when TTY)
|
||||
for snapshot_id in snapshot_ids:
|
||||
try:
|
||||
@@ -234,11 +238,14 @@ def run_plugins(
|
||||
for result in results:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'succeeded': 'green',
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
"succeeded": "green",
|
||||
"failed": "red",
|
||||
"skipped": "yellow",
|
||||
}.get(result.status, "dim")
|
||||
rprint(
|
||||
f" [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
else:
|
||||
write_record(result.to_json())
|
||||
except Snapshot.DoesNotExist:
|
||||
@@ -250,18 +257,20 @@ def run_plugins(
|
||||
def is_archiveresult_id(value: str) -> bool:
|
||||
"""Check if value looks like an ArchiveResult UUID."""
|
||||
import re
|
||||
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
|
||||
|
||||
uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
|
||||
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
|
||||
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
|
||||
@click.argument("args", nargs=-1)
|
||||
def main(plugins: str, wait: bool, args: tuple):
|
||||
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
||||
from archivebox.misc.jsonl import read_args_or_stdin
|
||||
@@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
if not records:
|
||||
from rich import print as rprint
|
||||
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
|
||||
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Check if input looks like existing ArchiveResult IDs to process
|
||||
all_are_archiveresult_ids = all(
|
||||
is_archiveresult_id(r.get('id') or r.get('url', ''))
|
||||
for r in records
|
||||
)
|
||||
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
|
||||
|
||||
if all_are_archiveresult_ids:
|
||||
# Process existing ArchiveResults by ID
|
||||
@@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
|
||||
exit_code = 0
|
||||
for record in records:
|
||||
archiveresult_id = record.get('id') or record.get('url')
|
||||
archiveresult_id = record.get("id") or record.get("url")
|
||||
if not isinstance(archiveresult_id, str):
|
||||
rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
|
||||
exit_code = 1
|
||||
continue
|
||||
result = process_archiveresult_by_id(archiveresult_id)
|
||||
@@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple):
|
||||
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox help'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox help"
|
||||
|
||||
import os
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
@@ -17,33 +17,44 @@ def help() -> None:
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
from archivebox.misc.logging_util import log_cli_command
|
||||
|
||||
log_cli_command('help', [], None, '.')
|
||||
|
||||
COMMANDS_HELP_TEXT = '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
) + '\n\n ' + '\n '.join(
|
||||
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
|
||||
for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
|
||||
log_cli_command("help", [], None, ".")
|
||||
|
||||
COMMANDS_HELP_TEXT = (
|
||||
"\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys()
|
||||
)
|
||||
+ "\n\n "
|
||||
+ "\n ".join(
|
||||
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys()
|
||||
)
|
||||
)
|
||||
|
||||
DOCKER_USAGE = '''
|
||||
|
||||
DOCKER_USAGE = (
|
||||
"""
|
||||
[dodger_blue3]Docker Usage:[/dodger_blue3]
|
||||
[grey53]# using Docker Compose:[/grey53]
|
||||
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
[grey53]# using Docker:[/grey53]
|
||||
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
''' if IN_DOCKER else ''
|
||||
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
|
||||
"""
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_DOCS = (
|
||||
"\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]"
|
||||
if IN_DOCKER
|
||||
else ""
|
||||
)
|
||||
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ""
|
||||
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ""
|
||||
|
||||
print(f'''{DOCKER_USAGE}
|
||||
print(f"""{DOCKER_USAGE}
|
||||
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
|
||||
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
|
||||
|
||||
@@ -54,12 +65,11 @@ def help() -> None:
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
|
||||
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
|
||||
''')
|
||||
|
||||
|
||||
""")
|
||||
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
|
||||
EXAMPLE_USAGE = f'''
|
||||
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~")
|
||||
EXAMPLE_USAGE = f"""
|
||||
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
|
||||
|
||||
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
|
||||
@@ -73,33 +83,49 @@ def help() -> None:
|
||||
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
|
||||
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
|
||||
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
|
||||
'''
|
||||
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
|
||||
"""
|
||||
print(
|
||||
Panel(
|
||||
EXAMPLE_USAGE,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]",
|
||||
subtitle="Commands run inside this dir will only apply to this collection.",
|
||||
),
|
||||
)
|
||||
else:
|
||||
DATA_SETUP_HELP = '\n'
|
||||
DATA_SETUP_HELP = "\n"
|
||||
if IN_DOCKER:
|
||||
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
|
||||
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
|
||||
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
|
||||
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
|
||||
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
|
||||
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
|
||||
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
|
||||
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
|
||||
DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n"
|
||||
DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n"
|
||||
DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n"
|
||||
DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n"
|
||||
DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
|
||||
DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
|
||||
DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n"
|
||||
print(
|
||||
Panel(
|
||||
DATA_SETUP_HELP,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:cross_mark: No collection is currently active[/red]",
|
||||
subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--help', '-h', is_flag=True, help='Show help')
|
||||
@click.option("--help", "-h", is_flag=True, help="Show help")
|
||||
def main(**kwargs):
|
||||
"""Print the ArchiveBox help message and usage"""
|
||||
return help()
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Mapping
|
||||
from collections.abc import Mapping
|
||||
|
||||
from rich import print
|
||||
import rich_click as click
|
||||
@@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None:
|
||||
url = link_dict.get('url')
|
||||
url = link_dict.get("url")
|
||||
if not isinstance(url, str) or not url:
|
||||
return None
|
||||
|
||||
record: dict[str, object] = {'url': url}
|
||||
for key in ('timestamp', 'title', 'tags', 'sources'):
|
||||
record: dict[str, object] = {"url": url}
|
||||
for key in ("timestamp", "title", "tags", "sources"):
|
||||
value = link_dict.get(key)
|
||||
if value is not None:
|
||||
record[key] = value
|
||||
@@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di
|
||||
|
||||
|
||||
@enforce_types
|
||||
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
def init(force: bool = False, quick: bool = False, install: bool = False) -> None:
|
||||
"""Initialize a new ArchiveBox collection in the current directory"""
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.collection import write_config_file
|
||||
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details
|
||||
from archivebox.misc.db import apply_migrations
|
||||
|
||||
|
||||
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
|
||||
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
|
||||
# print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
|
||||
@@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
|
||||
existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
|
||||
if is_empty and not existing_index:
|
||||
print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
elif existing_index:
|
||||
# TODO: properly detect and print the existing version in current index as well
|
||||
print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
|
||||
print('[green]----------------------------------------------------------------------[/green]')
|
||||
print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]")
|
||||
print("[green]----------------------------------------------------------------------[/green]")
|
||||
else:
|
||||
if force:
|
||||
print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
|
||||
print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
|
||||
print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]")
|
||||
print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]")
|
||||
else:
|
||||
print(
|
||||
("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
"[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
|
||||
" You must run init in a completely empty directory, or an existing data folder.\n\n"
|
||||
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
|
||||
" then run and run 'archivebox init' to pick up where you left off.\n\n"
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
|
||||
)
|
||||
" (Always make sure your data folder is backed up first before updating ArchiveBox)",
|
||||
)
|
||||
raise SystemExit(2)
|
||||
|
||||
if existing_index:
|
||||
print('\n[green][*] Verifying archive folder structure...[/green]')
|
||||
print("\n[green][*] Verifying archive folder structure...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building archive folder structure...[/green]')
|
||||
|
||||
print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
|
||||
print("\n[green][+] Building archive folder structure...[/green]")
|
||||
|
||||
print(
|
||||
f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...",
|
||||
)
|
||||
Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
||||
print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
|
||||
|
||||
|
||||
print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...")
|
||||
|
||||
# create the .archivebox_id file with a unique ID for this collection
|
||||
from archivebox.config.paths import _get_collection_id
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
_get_collection_id(DATA_DIR, force_create=True)
|
||||
|
||||
# create the ArchiveBox.conf file
|
||||
write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY})
|
||||
|
||||
if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
|
||||
print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
|
||||
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]")
|
||||
else:
|
||||
print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
|
||||
|
||||
print("\n[green][+] Building main SQL index and running initial migrations...[/green]")
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
|
||||
for migration_line in apply_migrations(DATA_DIR):
|
||||
sys.stdout.write(f' {migration_line}\n')
|
||||
sys.stdout.write(f" {migration_line}\n")
|
||||
|
||||
assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
|
||||
print()
|
||||
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
|
||||
|
||||
print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}")
|
||||
|
||||
# from django.contrib.auth.models import User
|
||||
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
# call_command("createsuperuser", interactive=True)
|
||||
|
||||
print()
|
||||
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
|
||||
print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]")
|
||||
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
if existing_index:
|
||||
all_links = Snapshot.objects.all()
|
||||
print(f' √ Loaded {all_links.count()} links from existing main index.')
|
||||
print(f" √ Loaded {all_links.count()} links from existing main index.")
|
||||
|
||||
if quick:
|
||||
print(' > Skipping orphan snapshot import (quick mode)')
|
||||
print(" > Skipping orphan snapshot import (quick mode)")
|
||||
else:
|
||||
try:
|
||||
# Import orphaned links from legacy JSON indexes
|
||||
@@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_json_links[url] = record
|
||||
if orphaned_json_links:
|
||||
pending_links.update(orphaned_json_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]")
|
||||
|
||||
orphaned_data_dir_links: dict[str, dict[str, object]] = {}
|
||||
for link_dict in parse_json_links_details(DATA_DIR):
|
||||
@@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
orphaned_data_dir_links[url] = record
|
||||
if orphaned_data_dir_links:
|
||||
pending_links.update(orphaned_data_dir_links)
|
||||
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
|
||||
print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]")
|
||||
|
||||
if pending_links:
|
||||
for link_dict in pending_links.values():
|
||||
@@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
|
||||
# Hint for orphaned snapshot directories
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
|
||||
print(' archivebox update')
|
||||
print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:")
|
||||
print(" archivebox update")
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
print(file=sys.stderr)
|
||||
print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
|
||||
print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
|
||||
print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr)
|
||||
print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
|
||||
print(' archivebox init --quick', file=sys.stderr)
|
||||
print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr)
|
||||
print(" archivebox init --quick", file=sys.stderr)
|
||||
raise SystemExit(1)
|
||||
|
||||
print('\n[green]----------------------------------------------------------------------[/green]')
|
||||
print("\n[green]----------------------------------------------------------------------[/green]")
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
|
||||
print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
|
||||
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(
|
||||
username=SERVER_CONFIG.ADMIN_USERNAME,
|
||||
).exists():
|
||||
print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]")
|
||||
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
|
||||
|
||||
if existing_index:
|
||||
print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
|
||||
print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]")
|
||||
else:
|
||||
print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
|
||||
print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]")
|
||||
|
||||
|
||||
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
|
||||
|
||||
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
(STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True)
|
||||
if working_tmp_dir:
|
||||
@@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
|
||||
working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True)
|
||||
if working_lib_dir:
|
||||
working_lib_dir.mkdir(parents=True, exist_ok=True)
|
||||
(working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
(working_lib_dir / "bin").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if install:
|
||||
from archivebox.cli.archivebox_install import install as install_method
|
||||
|
||||
install_method()
|
||||
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
if Snapshot.objects.count() < 25: # hide the hints for experienced users
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] To view your archive index, run:')
|
||||
print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
|
||||
print(" [violet]Hint:[/violet] To view your archive index, run:")
|
||||
print(
|
||||
" archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print()
|
||||
print(' To add new links, you can run:')
|
||||
print(" To add new links, you can run:")
|
||||
print(" archivebox add < ~/some/path/to/list_of_links.txt")
|
||||
print()
|
||||
print(' For more usage and examples, run:')
|
||||
print(' archivebox help')
|
||||
|
||||
print(" For more usage and examples, run:")
|
||||
print(" archivebox help")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
|
||||
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
|
||||
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
|
||||
@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway")
|
||||
@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs")
|
||||
@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving")
|
||||
@docstring(init.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
init(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
|
||||
@@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running the abx-dl install flow
|
||||
|
||||
Examples:
|
||||
@@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Show what we're installing
|
||||
if binaries:
|
||||
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
|
||||
print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]")
|
||||
else:
|
||||
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
|
||||
print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]")
|
||||
|
||||
if binproviders != '*':
|
||||
print(f'[green][+] Using providers: {binproviders}[/green]')
|
||||
if binproviders != "*":
|
||||
print(f"[green][+] Using providers: {binproviders}[/green]")
|
||||
|
||||
if IS_ROOT:
|
||||
EUID = os.geteuid()
|
||||
print()
|
||||
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
|
||||
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
|
||||
print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]")
|
||||
print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].")
|
||||
print()
|
||||
|
||||
if dry_run:
|
||||
print('[dim]Dry run - would run the abx-dl install flow[/dim]')
|
||||
print("[dim]Dry run - would run the abx-dl install flow[/dim]")
|
||||
return
|
||||
|
||||
# Set up Django
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
plugin_names = list(binaries)
|
||||
if binproviders != '*':
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip())
|
||||
if binproviders != "*":
|
||||
plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip())
|
||||
|
||||
print('[+] Running installer via abx-dl bus...')
|
||||
print("[+] Running installer via abx-dl bus...")
|
||||
print()
|
||||
|
||||
from archivebox.services.runner import run_install
|
||||
@@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
|
||||
|
||||
# Check for superuser
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
|
||||
stderr(' archivebox manage createsuperuser')
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green")
|
||||
stderr(" archivebox manage createsuperuser")
|
||||
|
||||
print()
|
||||
|
||||
# Show version to display full status including installed binaries
|
||||
# Django is already loaded, so just import and call the function directly
|
||||
from archivebox.cli.archivebox_version import version as show_version
|
||||
|
||||
show_version(quiet=False)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('binaries', nargs=-1, type=str, required=False)
|
||||
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
|
||||
@click.argument("binaries", nargs=-1, type=str, required=False)
|
||||
@click.option(
|
||||
"--binproviders",
|
||||
"-p",
|
||||
default="*",
|
||||
help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all",
|
||||
show_default=True,
|
||||
)
|
||||
@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False)
|
||||
@docstring(install.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
install(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox list"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
|
||||
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
|
||||
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
|
||||
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def main(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
) -> None:
|
||||
"""List Snapshots."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -19,11 +19,10 @@ Examples:
|
||||
archivebox machine list --hostname__icontains=myserver
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox machine'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox machine"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_machines(
|
||||
hostname__icontains: Optional[str] = None,
|
||||
os_platform: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
hostname__icontains: str | None = None,
|
||||
os_platform: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Machines as JSONL with optional filters.
|
||||
@@ -51,24 +51,24 @@ def list_machines(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Machine.objects.all().order_by('-created_at')
|
||||
queryset = Machine.objects.all().order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'hostname__icontains': hostname__icontains,
|
||||
'os_platform': os_platform,
|
||||
"hostname__icontains": hostname__icontains,
|
||||
"os_platform": os_platform,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for machine in queryset:
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
|
||||
rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}")
|
||||
else:
|
||||
write_record(machine.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -76,24 +76,27 @@ def list_machines(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Machine records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--hostname__icontains', help='Filter by hostname contains')
|
||||
@click.option('--os-platform', help='Filter by OS platform')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--hostname__icontains", help="Filter by hostname contains")
|
||||
@click.option("--os-platform", help="Filter by OS platform")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None):
|
||||
"""List Machines as JSONL."""
|
||||
sys.exit(list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_machines(
|
||||
hostname__icontains=hostname__icontains,
|
||||
os_platform=os_platform,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,33 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def manage(args: list[str] | None=None) -> None:
|
||||
def manage(args: list[str] | None = None) -> None:
|
||||
"""Run an ArchiveBox Django management command"""
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
|
||||
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
|
||||
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
|
||||
stderr('')
|
||||
stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow")
|
||||
stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow")
|
||||
stderr("")
|
||||
|
||||
from django.core.management import execute_from_command_line
|
||||
execute_from_command_line(['manage.py', *(args or ['help'])])
|
||||
|
||||
execute_from_command_line(["manage.py", *(args or ["help"])])
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(manage.__doc__)
|
||||
def main(args: list[str] | None=None) -> None:
|
||||
def main(args: list[str] | None = None) -> None:
|
||||
manage(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode.
|
||||
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox mcp'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox mcp"
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -45,5 +45,5 @@ def main(**kwargs):
|
||||
mcp()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -24,8 +24,8 @@ Examples:
|
||||
archivebox persona list --name=old | archivebox persona delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox persona'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox persona"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -35,7 +35,7 @@ import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
from collections import OrderedDict
|
||||
|
||||
import rich_click as click
|
||||
@@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers
|
||||
# Browser Profile Locations
|
||||
# =============================================================================
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
|
||||
|
||||
def get_chrome_user_data_dir() -> Path | None:
|
||||
"""Get the default Chrome user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin': # macOS
|
||||
if system == "Darwin": # macOS
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
|
||||
home / 'Library' / 'Application Support' / 'Chromium',
|
||||
home / "Library" / "Application Support" / "Google" / "Chrome",
|
||||
home / "Library" / "Application Support" / "Chromium",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'google-chrome',
|
||||
home / '.config' / 'chromium',
|
||||
home / '.config' / 'chrome',
|
||||
home / 'snap' / 'chromium' / 'common' / 'chromium',
|
||||
home / ".config" / "google-chrome",
|
||||
home / ".config" / "chromium",
|
||||
home / ".config" / "chrome",
|
||||
home / "snap" / "chromium" / "common" / "chromium",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Google' / 'Chrome' / 'User Data',
|
||||
local_app_data / 'Chromium' / 'User Data',
|
||||
local_app_data / "Google" / "Chrome" / "User Data",
|
||||
local_app_data / "Chromium" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
|
||||
def get_brave_user_data_dir() -> Path | None:
|
||||
"""Get the default Brave user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'BraveSoftware' / 'Brave-Browser',
|
||||
home / ".config" / "BraveSoftware" / "Brave-Browser",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
|
||||
local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_edge_user_data_dir() -> Optional[Path]:
|
||||
def get_edge_user_data_dir() -> Path | None:
|
||||
"""Get the default Edge user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = [
|
||||
home / 'Library' / 'Application Support' / 'Microsoft Edge',
|
||||
home / "Library" / "Application Support" / "Microsoft Edge",
|
||||
]
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = [
|
||||
home / '.config' / 'microsoft-edge',
|
||||
home / '.config' / 'microsoft-edge-beta',
|
||||
home / '.config' / 'microsoft-edge-dev',
|
||||
home / ".config" / "microsoft-edge",
|
||||
home / ".config" / "microsoft-edge-beta",
|
||||
home / ".config" / "microsoft-edge-dev",
|
||||
]
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = [
|
||||
local_app_data / 'Microsoft' / 'Edge' / 'User Data',
|
||||
local_app_data / "Microsoft" / "Edge" / "User Data",
|
||||
]
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.exists() and (candidate / 'Default').exists():
|
||||
if candidate.exists() and (candidate / "Default").exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_browser_binary(browser: str) -> Optional[str]:
|
||||
def get_browser_binary(browser: str) -> str | None:
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
browser = browser.lower()
|
||||
|
||||
if system == 'Darwin':
|
||||
if system == "Darwin":
|
||||
candidates = {
|
||||
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
|
||||
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
|
||||
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
|
||||
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
|
||||
"chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
|
||||
"chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
|
||||
"brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
|
||||
"edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
|
||||
}.get(browser, [])
|
||||
elif system == 'Linux':
|
||||
elif system == "Linux":
|
||||
candidates = {
|
||||
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
|
||||
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
|
||||
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
|
||||
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
|
||||
"chrome": [
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/google-chrome-stable",
|
||||
"/usr/bin/google-chrome-beta",
|
||||
"/usr/bin/google-chrome-unstable",
|
||||
],
|
||||
"chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
|
||||
"brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
|
||||
"edge": [
|
||||
"/usr/bin/microsoft-edge",
|
||||
"/usr/bin/microsoft-edge-stable",
|
||||
"/usr/bin/microsoft-edge-beta",
|
||||
"/usr/bin/microsoft-edge-dev",
|
||||
],
|
||||
}.get(browser, [])
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
elif system == "Windows":
|
||||
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
|
||||
candidates = {
|
||||
'chrome': [
|
||||
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
|
||||
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
||||
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
||||
"chrome": [
|
||||
str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
|
||||
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
||||
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
|
||||
],
|
||||
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
|
||||
'brave': [
|
||||
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
|
||||
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
"chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
|
||||
"brave": [
|
||||
str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
|
||||
"C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
"C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
|
||||
],
|
||||
'edge': [
|
||||
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
|
||||
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
"edge": [
|
||||
str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
|
||||
"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
|
||||
],
|
||||
}.get(browser, [])
|
||||
else:
|
||||
@@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]:
|
||||
|
||||
|
||||
BROWSER_PROFILE_FINDERS = {
|
||||
'chrome': get_chrome_user_data_dir,
|
||||
'chromium': get_chrome_user_data_dir, # Same locations
|
||||
'brave': get_brave_user_data_dir,
|
||||
'edge': get_edge_user_data_dir,
|
||||
"chrome": get_chrome_user_data_dir,
|
||||
"chromium": get_chrome_user_data_dir, # Same locations
|
||||
"brave": get_brave_user_data_dir,
|
||||
"edge": get_edge_user_data_dir,
|
||||
}
|
||||
|
||||
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
# =============================================================================
|
||||
|
||||
NETSCAPE_COOKIE_HEADER = [
|
||||
'# Netscape HTTP Cookie File',
|
||||
'# https://curl.se/docs/http-cookies.html',
|
||||
'# This file was generated by ArchiveBox persona cookie extraction',
|
||||
'#',
|
||||
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
|
||||
'',
|
||||
"# Netscape HTTP Cookie File",
|
||||
"# https://curl.se/docs/http-cookies.html",
|
||||
"# This file was generated by ArchiveBox persona cookie extraction",
|
||||
"#",
|
||||
"# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
|
||||
"",
|
||||
]
|
||||
|
||||
|
||||
@@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
return cookies
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
|
||||
@@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
|
||||
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
|
||||
lines = list(NETSCAPE_COOKIE_HEADER)
|
||||
for cookie in cookies.values():
|
||||
lines.append('\t'.join(cookie))
|
||||
path.write_text('\n'.join(lines) + '\n')
|
||||
lines.append("\t".join(cookie))
|
||||
path.write_text("\n".join(lines) + "\n")
|
||||
|
||||
|
||||
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
|
||||
@@ -259,52 +270,52 @@ def extract_cookies_via_cdp(
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
# Find the cookie extraction script
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
|
||||
extract_script = chrome_plugin_dir / 'extract_cookies.js'
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome"
|
||||
extract_script = chrome_plugin_dir / "extract_cookies.js"
|
||||
|
||||
if not extract_script.exists():
|
||||
rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
# Get node modules dir
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
|
||||
|
||||
# Set up environment
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(node_modules_dir)
|
||||
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env["NODE_MODULES_DIR"] = str(node_modules_dir)
|
||||
env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
|
||||
env["CHROME_HEADLESS"] = "true"
|
||||
if chrome_binary:
|
||||
env['CHROME_BINARY'] = str(chrome_binary)
|
||||
env["CHROME_BINARY"] = str(chrome_binary)
|
||||
output_path = output_file
|
||||
temp_output = None
|
||||
temp_dir = None
|
||||
if output_file.exists():
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
|
||||
temp_output = temp_dir / 'cookies.txt'
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_"))
|
||||
temp_output = temp_dir / "cookies.txt"
|
||||
output_path = temp_output
|
||||
if profile_dir:
|
||||
extra_arg = f'--profile-directory={profile_dir}'
|
||||
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
|
||||
extra_arg = f"--profile-directory={profile_dir}"
|
||||
existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
|
||||
args_list = []
|
||||
if existing_extra:
|
||||
if existing_extra.startswith('['):
|
||||
if existing_extra.startswith("["):
|
||||
try:
|
||||
parsed = json.loads(existing_extra)
|
||||
if isinstance(parsed, list):
|
||||
args_list.extend(str(x) for x in parsed)
|
||||
except Exception:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
else:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
|
||||
args_list.append(extra_arg)
|
||||
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
|
||||
env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
|
||||
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_path)
|
||||
env["COOKIES_OUTPUT_FILE"] = str(output_path)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['node', str(extract_script)],
|
||||
["node", str(extract_script)],
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -316,17 +327,17 @@ def extract_cookies_via_cdp(
|
||||
_merge_netscape_cookies(output_file, temp_output)
|
||||
return True
|
||||
else:
|
||||
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except FileNotFoundError:
|
||||
rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr)
|
||||
return False
|
||||
finally:
|
||||
if temp_dir and temp_dir.exists():
|
||||
@@ -337,6 +348,7 @@ def extract_cookies_via_cdp(
|
||||
# Validation Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
"""
|
||||
Validate persona name to prevent path traversal attacks.
|
||||
@@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]:
|
||||
return False, "Persona name cannot be empty"
|
||||
|
||||
# Check for path separators
|
||||
if '/' in name or '\\' in name:
|
||||
if "/" in name or "\\" in name:
|
||||
return False, "Persona name cannot contain path separators (/ or \\)"
|
||||
|
||||
# Check for parent directory references
|
||||
if '..' in name:
|
||||
if ".." in name:
|
||||
return False, "Persona name cannot contain parent directory references (..)"
|
||||
|
||||
# Check for hidden files/directories
|
||||
if name.startswith('.'):
|
||||
if name.startswith("."):
|
||||
return False, "Persona name cannot start with a dot (.)"
|
||||
|
||||
# Ensure name doesn't contain null bytes or other dangerous chars
|
||||
if '\x00' in name or '\n' in name or '\r' in name:
|
||||
if "\x00" in name or "\n" in name or "\r" in name:
|
||||
return False, "Persona name contains invalid characters"
|
||||
|
||||
return True, ""
|
||||
@@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_personas(
|
||||
names: Iterable[str],
|
||||
import_from: Optional[str] = None,
|
||||
profile: Optional[str] = None,
|
||||
import_from: str | None = None,
|
||||
profile: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Personas from names.
|
||||
@@ -416,7 +429,7 @@ def create_personas(
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Validate import source if specified
|
||||
@@ -424,23 +437,23 @@ def create_personas(
|
||||
if import_from:
|
||||
import_from = import_from.lower()
|
||||
if import_from not in BROWSER_PROFILE_FINDERS:
|
||||
rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
|
||||
rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
|
||||
rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
|
||||
rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
|
||||
if not source_profile_dir:
|
||||
rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)
|
||||
|
||||
if profile is None and (source_profile_dir / 'Default').exists():
|
||||
profile = 'Default'
|
||||
if profile is None and (source_profile_dir / "Default").exists():
|
||||
profile = "Default"
|
||||
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
@@ -459,11 +472,11 @@ def create_personas(
|
||||
if created:
|
||||
persona.ensure_dirs()
|
||||
created_count += 1
|
||||
rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
cookies_file = Path(persona.path) / 'cookies.txt'
|
||||
cookies_file = Path(persona.path) / "cookies.txt"
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
|
||||
@@ -477,29 +490,31 @@ def create_personas(
|
||||
capture_storage=False,
|
||||
)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if import_result.profile_copied:
|
||||
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
|
||||
if import_result.cookies_imported:
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
|
||||
elif not import_result.profile_copied:
|
||||
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)
|
||||
|
||||
for warning in import_result.warnings:
|
||||
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
|
||||
rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -507,10 +522,11 @@ def create_personas(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_personas(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Personas as JSONL with optional filters.
|
||||
@@ -523,33 +539,35 @@ def list_personas(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Persona.objects.all().order_by('name')
|
||||
queryset = Persona.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for persona in queryset:
|
||||
cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
|
||||
chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
|
||||
cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]"
|
||||
chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]"
|
||||
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
|
||||
rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]")
|
||||
else:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
|
||||
'COOKIES_FILE': persona.COOKIES_FILE,
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
|
||||
"COOKIES_FILE": persona.COOKIES_FILE,
|
||||
},
|
||||
)
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -557,7 +575,8 @@ def list_personas(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_personas(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Personas from stdin JSONL.
|
||||
|
||||
@@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
persona_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
persona_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not persona_id and not old_name:
|
||||
continue
|
||||
@@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
updated_count += 1
|
||||
|
||||
if not is_tty:
|
||||
write_record({
|
||||
'id': str(persona.id) if hasattr(persona, 'id') else None,
|
||||
'name': persona.name,
|
||||
'path': str(persona.path),
|
||||
})
|
||||
write_record(
|
||||
{
|
||||
"id": str(persona.id) if hasattr(persona, "id") else None,
|
||||
"name": persona.name,
|
||||
"path": str(persona.path),
|
||||
},
|
||||
)
|
||||
|
||||
except Persona.DoesNotExist:
|
||||
rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Personas from stdin JSONL.
|
||||
@@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect persona IDs or names
|
||||
persona_ids = []
|
||||
persona_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
persona_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
persona_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
persona_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
persona_names.append(r["name"])
|
||||
|
||||
if not persona_ids and not persona_names:
|
||||
rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if persona_ids:
|
||||
query |= Q(id__in=persona_ids)
|
||||
@@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = personas.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr)
|
||||
for persona in personas:
|
||||
rprint(f' {persona.name} ({persona.path})', file=sys.stderr)
|
||||
rprint(f" {persona.name} ({persona.path})", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Delete persona directories and database records
|
||||
@@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
persona.delete()
|
||||
deleted_count += 1
|
||||
|
||||
rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Persona records (browser profiles)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
|
||||
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
|
||||
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
|
||||
@main.command("create")
|
||||
@click.argument("names", nargs=-1)
|
||||
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
|
||||
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
|
||||
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
|
||||
"""Create Personas, optionally importing from a browser profile."""
|
||||
sys.exit(create_personas(names, import_from=import_from, profile=profile))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--name', help='Filter by exact name')
|
||||
@click.option('--name__icontains', help='Filter by name contains')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--name", help="Filter by exact name")
|
||||
@click.option("--name__icontains", help="Filter by name contains")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
|
||||
"""List Personas as JSONL."""
|
||||
sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--name', '-n', help='Set new name')
|
||||
def update_cmd(name: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--name", "-n", help="Set new name")
|
||||
def update_cmd(name: str | None):
|
||||
"""Update Personas from stdin JSONL."""
|
||||
sys.exit(update_personas(name=name))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Personas from stdin JSONL."""
|
||||
sys.exit(delete_personas(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """
|
||||
@enforce_types
|
||||
def pluginmap(
|
||||
show_disabled: bool = False,
|
||||
model: Optional[str] = None,
|
||||
model: str | None = None,
|
||||
quiet: bool = False,
|
||||
) -> dict:
|
||||
"""
|
||||
@@ -164,25 +163,25 @@ def pluginmap(
|
||||
|
||||
# Model event types that can have hooks
|
||||
model_events = {
|
||||
'Crawl': {
|
||||
'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': CRAWL_MACHINE_DIAGRAM,
|
||||
"Crawl": {
|
||||
"description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": CRAWL_MACHINE_DIAGRAM,
|
||||
},
|
||||
'CrawlEnd': {
|
||||
'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
|
||||
'machine': 'CrawlMachine',
|
||||
'diagram': None, # Part of CrawlMachine
|
||||
"CrawlEnd": {
|
||||
"description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
|
||||
"machine": "CrawlMachine",
|
||||
"diagram": None, # Part of CrawlMachine
|
||||
},
|
||||
'Snapshot': {
|
||||
'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
|
||||
'machine': 'SnapshotMachine',
|
||||
'diagram': SNAPSHOT_MACHINE_DIAGRAM,
|
||||
"Snapshot": {
|
||||
"description": "Hooks run for each Snapshot (creates ArchiveResults)",
|
||||
"machine": "SnapshotMachine",
|
||||
"diagram": SNAPSHOT_MACHINE_DIAGRAM,
|
||||
},
|
||||
'Binary': {
|
||||
'description': 'Hooks for installing binary dependencies (providers)',
|
||||
'machine': 'BinaryMachine',
|
||||
'diagram': BINARY_MACHINE_DIAGRAM,
|
||||
"Binary": {
|
||||
"description": "Hooks for installing binary dependencies (providers)",
|
||||
"machine": "BinaryMachine",
|
||||
"diagram": BINARY_MACHINE_DIAGRAM,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -195,16 +194,16 @@ def pluginmap(
|
||||
model_events = {model: model_events[model]}
|
||||
|
||||
result = {
|
||||
'models': {},
|
||||
'plugins_dir': str(BUILTIN_PLUGINS_DIR),
|
||||
'user_plugins_dir': str(USER_PLUGINS_DIR),
|
||||
"models": {},
|
||||
"plugins_dir": str(BUILTIN_PLUGINS_DIR),
|
||||
"user_plugins_dir": str(USER_PLUGINS_DIR),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
prnt()
|
||||
prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
|
||||
prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
|
||||
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
|
||||
prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
|
||||
prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
|
||||
prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
|
||||
prnt()
|
||||
|
||||
for event_name, info in model_events.items():
|
||||
@@ -218,88 +217,93 @@ def pluginmap(
|
||||
plugin_name = hook_path.parent.name
|
||||
is_bg = is_background_hook(hook_path.name)
|
||||
|
||||
hook_infos.append({
|
||||
'path': str(hook_path),
|
||||
'name': hook_path.name,
|
||||
'plugin': plugin_name,
|
||||
'is_background': is_bg,
|
||||
'extension': hook_path.suffix,
|
||||
})
|
||||
hook_infos.append(
|
||||
{
|
||||
"path": str(hook_path),
|
||||
"name": hook_path.name,
|
||||
"plugin": plugin_name,
|
||||
"is_background": is_bg,
|
||||
"extension": hook_path.suffix,
|
||||
},
|
||||
)
|
||||
|
||||
result['models'][event_name] = {
|
||||
'description': info['description'],
|
||||
'machine': info['machine'],
|
||||
'hooks': hook_infos,
|
||||
'hook_count': len(hook_infos),
|
||||
result["models"][event_name] = {
|
||||
"description": info["description"],
|
||||
"machine": info["machine"],
|
||||
"hooks": hook_infos,
|
||||
"hook_count": len(hook_infos),
|
||||
}
|
||||
|
||||
if not quiet:
|
||||
# Show diagram if this model has one
|
||||
if info.get('diagram'):
|
||||
assert info['diagram'] is not None
|
||||
prnt(Panel(
|
||||
info['diagram'],
|
||||
title=f'[bold green]{info["machine"]}[/bold green]',
|
||||
border_style='green',
|
||||
expand=False,
|
||||
))
|
||||
if info.get("diagram"):
|
||||
assert info["diagram"] is not None
|
||||
prnt(
|
||||
Panel(
|
||||
info["diagram"],
|
||||
title=f"[bold green]{info['machine']}[/bold green]",
|
||||
border_style="green",
|
||||
expand=False,
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
|
||||
# Create hooks table
|
||||
table = Table(
|
||||
title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
|
||||
title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)",
|
||||
box=box.ROUNDED,
|
||||
show_header=True,
|
||||
header_style='bold magenta',
|
||||
header_style="bold magenta",
|
||||
)
|
||||
table.add_column('Plugin', style='cyan', width=20)
|
||||
table.add_column('Hook Name', style='green')
|
||||
table.add_column('BG', justify='center', width=4)
|
||||
table.add_column('Type', justify='center', width=5)
|
||||
table.add_column("Plugin", style="cyan", width=20)
|
||||
table.add_column("Hook Name", style="green")
|
||||
table.add_column("BG", justify="center", width=4)
|
||||
table.add_column("Type", justify="center", width=5)
|
||||
|
||||
# Sort lexicographically by hook name
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
|
||||
sorted_hooks = sorted(hook_infos, key=lambda h: h["name"])
|
||||
|
||||
for hook in sorted_hooks:
|
||||
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
|
||||
ext = hook['extension'].lstrip('.')
|
||||
bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
|
||||
ext = hook["extension"].lstrip(".")
|
||||
table.add_row(
|
||||
hook['plugin'],
|
||||
hook['name'],
|
||||
hook["plugin"],
|
||||
hook["name"],
|
||||
bg_marker,
|
||||
ext,
|
||||
)
|
||||
|
||||
prnt(table)
|
||||
prnt()
|
||||
prnt(f'[dim]{info["description"]}[/dim]')
|
||||
prnt(f"[dim]{info['description']}[/dim]")
|
||||
prnt()
|
||||
|
||||
# Summary
|
||||
if not quiet:
|
||||
total_hooks = sum(m['hook_count'] for m in result['models'].values())
|
||||
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
|
||||
total_hooks = sum(m["hook_count"] for m in result["models"].values())
|
||||
prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
|
||||
prnt()
|
||||
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
|
||||
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
|
||||
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
|
||||
prnt('[dim] - ext: py, sh, or js[/dim]')
|
||||
prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
|
||||
prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
|
||||
prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
|
||||
prnt("[dim] - ext: py, sh, or js[/dim]")
|
||||
prnt()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
|
||||
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
|
||||
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
|
||||
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
|
||||
@docstring(pluginmap.__doc__)
|
||||
def main(**kwargs):
|
||||
import json
|
||||
|
||||
result = pluginmap(**kwargs)
|
||||
if kwargs.get('quiet'):
|
||||
if kwargs.get("quiet"):
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -22,11 +22,10 @@ Examples:
|
||||
archivebox process list --limit=10
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox process'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox process"
|
||||
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_processes(
|
||||
binary_name: Optional[str] = None,
|
||||
machine_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
binary_name: str | None = None,
|
||||
machine_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Processes as JSONL with optional filters.
|
||||
@@ -54,29 +54,29 @@ def list_processes(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
|
||||
queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {}
|
||||
if binary_name:
|
||||
filter_kwargs['binary__name'] = binary_name
|
||||
filter_kwargs["binary__name"] = binary_name
|
||||
if machine_id:
|
||||
filter_kwargs['machine_id'] = machine_id
|
||||
filter_kwargs["machine_id"] = machine_id
|
||||
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
count = 0
|
||||
for process in queryset:
|
||||
if is_tty:
|
||||
binary_name_str = process.binary.name if process.binary else 'unknown'
|
||||
exit_code = process.exit_code if process.exit_code is not None else '?'
|
||||
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
|
||||
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
|
||||
binary_name_str = process.binary.name if process.binary else "unknown"
|
||||
exit_code = process.exit_code if process.exit_code is not None else "?"
|
||||
status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow"
|
||||
rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")
|
||||
else:
|
||||
write_record(process.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -84,24 +84,27 @@ def list_processes(
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Process records (read-only, system-managed)."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--binary-name', '-b', help='Filter by binary name')
|
||||
@click.option('--machine-id', '-m', help='Filter by machine ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--binary-name", "-b", help="Filter by binary name")
|
||||
@click.option("--machine-id", "-m", help="Filter by machine ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None):
|
||||
"""List Processes as JSONL."""
|
||||
sys.exit(list_processes(
|
||||
binary_name=binary_name,
|
||||
machine_id=machine_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_processes(
|
||||
binary_name=binary_name,
|
||||
machine_id=machine_id,
|
||||
limit=limit,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox remove'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox remove"
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -26,25 +26,27 @@ from archivebox.misc.logging_util import (
|
||||
|
||||
|
||||
@enforce_types
|
||||
def remove(filter_patterns: Iterable[str]=(),
|
||||
filter_type: str='exact',
|
||||
snapshots: QuerySet | None=None,
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
yes: bool=False,
|
||||
delete: bool=False,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
def remove(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
snapshots: QuerySet | None = None,
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
yes: bool = False,
|
||||
delete: bool = False,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet:
|
||||
"""Remove the specified URLs from the archive"""
|
||||
|
||||
|
||||
setup_django()
|
||||
check_data_folder()
|
||||
|
||||
|
||||
from archivebox.cli.archivebox_search import get_snapshots
|
||||
|
||||
pattern_list = list(filter_patterns)
|
||||
|
||||
log_list_started(pattern_list or None, filter_type)
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
snapshots = get_snapshots(
|
||||
snapshots=snapshots,
|
||||
@@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
log_list_finished(snapshots)
|
||||
log_removal_started(snapshots, yes=yes, delete=delete)
|
||||
|
||||
timer = TimedProgress(360, prefix=' ')
|
||||
timer = TimedProgress(360, prefix=" ")
|
||||
try:
|
||||
for snapshot in snapshots:
|
||||
if delete:
|
||||
@@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
|
||||
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
|
||||
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
|
||||
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
|
||||
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm")
|
||||
@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index")
|
||||
@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp")
|
||||
@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp")
|
||||
@click.option(
|
||||
"--filter-type",
|
||||
"-f",
|
||||
type=click.Choice(("exact", "substring", "domain", "regex", "tag")),
|
||||
default="exact",
|
||||
help="Type of pattern matching to use when filtering URLs",
|
||||
)
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(remove.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Remove the specified URLs from the archive"""
|
||||
remove(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -37,8 +37,8 @@ Examples:
|
||||
archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox run'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox run"
|
||||
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
@@ -87,8 +87,8 @@ def process_stdin_records() -> int:
|
||||
binary_ids: list[str] = []
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_id = record.get('id')
|
||||
record_type = record.get("type", "")
|
||||
record_id = record.get("id")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -97,10 +97,10 @@ def process_stdin_records() -> int:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=record_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New crawl - create it
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if crawl:
|
||||
crawl.retry_at = timezone.now()
|
||||
@@ -112,16 +112,16 @@ def process_stdin_records() -> int:
|
||||
output_records.append(crawl.to_json())
|
||||
queued_count += 1
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
|
||||
elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
|
||||
if record_id:
|
||||
# Existing snapshot - re-queue
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=record_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
# New snapshot - create it
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if snapshot:
|
||||
snapshot.retry_at = timezone.now()
|
||||
@@ -132,7 +132,7 @@ def process_stdin_records() -> int:
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
run_all_plugins_for_crawl.add(crawl_id)
|
||||
@@ -149,11 +149,16 @@ def process_stdin_records() -> int:
|
||||
else:
|
||||
archiveresult = None
|
||||
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin_name = record.get('plugin')
|
||||
snapshot_id = record.get("snapshot_id")
|
||||
plugin_name = record.get("plugin")
|
||||
snapshot = None
|
||||
if archiveresult:
|
||||
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
|
||||
if archiveresult.status in [
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
ArchiveResult.StatusChoices.BACKOFF,
|
||||
]:
|
||||
archiveresult.reset_for_retry()
|
||||
snapshot = archiveresult.snapshot
|
||||
plugin_name = plugin_name or archiveresult.plugin
|
||||
@@ -167,12 +172,12 @@ def process_stdin_records() -> int:
|
||||
snapshot.retry_at = timezone.now()
|
||||
if snapshot.status != Snapshot.StatusChoices.STARTED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl = snapshot.crawl
|
||||
crawl.retry_at = timezone.now()
|
||||
if crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
crawl_id = str(snapshot.crawl_id)
|
||||
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
|
||||
if plugin_name:
|
||||
@@ -203,7 +208,7 @@ def process_stdin_records() -> int:
|
||||
output_records.append(record)
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
# Output all processed records (for chaining)
|
||||
@@ -212,10 +217,10 @@ def process_stdin_records() -> int:
|
||||
write_record(rec)
|
||||
|
||||
if queued_count == 0:
|
||||
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
|
||||
rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)
|
||||
|
||||
for binary_id in binary_ids:
|
||||
run_binary(binary_id)
|
||||
@@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int:
|
||||
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
recover_orphaned_snapshots()
|
||||
recover_orphaned_crawls()
|
||||
Machine.current()
|
||||
current = Process.current()
|
||||
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
current.process_type = Process.TypeChoices.ORCHESTRATOR
|
||||
current.save(update_fields=['process_type', 'modified_at'])
|
||||
current.save(update_fields=["process_type", "modified_at"])
|
||||
|
||||
try:
|
||||
run_pending_crawls(daemon=daemon)
|
||||
@@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int:
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
return 1
|
||||
finally:
|
||||
current.refresh_from_db()
|
||||
if current.status != Process.StatusChoices.EXITED:
|
||||
current.status = Process.StatusChoices.EXITED
|
||||
current.ended_at = current.ended_at or timezone.now()
|
||||
current.save(update_fields=['status', 'ended_at', 'modified_at'])
|
||||
current.save(update_fields=["status", "ended_at", "modified_at"])
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only")
|
||||
@click.option('--snapshot-id', help="Run one snapshot through its crawl")
|
||||
@click.option('--binary-id', help="Run one queued binary install directly on the bus")
|
||||
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
|
||||
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
|
||||
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
|
||||
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
|
||||
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
"""
|
||||
Process queued work.
|
||||
@@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
if crawl_id:
|
||||
try:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
run_crawl(crawl_id)
|
||||
sys.exit(0)
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
@@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int:
|
||||
from archivebox.services.runner import run_crawl
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id)
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
return 0
|
||||
except Exception as e:
|
||||
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
@enforce_types
|
||||
def schedule(add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = '',
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None):
|
||||
def schedule(
|
||||
add: bool = False,
|
||||
show: bool = False,
|
||||
clear: bool = False,
|
||||
foreground: bool = False,
|
||||
run_all: bool = False,
|
||||
quiet: bool = False,
|
||||
every: str | None = None,
|
||||
tag: str = "",
|
||||
depth: int | str = 0,
|
||||
overwrite: bool = False,
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
|
||||
import_path: str | None = None,
|
||||
):
|
||||
"""Manage database-backed scheduled crawls processed by the crawl runner."""
|
||||
|
||||
from django.utils import timezone
|
||||
@@ -33,55 +35,51 @@ def schedule(add: bool = False,
|
||||
|
||||
depth = int(depth)
|
||||
result: dict[str, object] = {
|
||||
'created_schedule_ids': [],
|
||||
'disabled_count': 0,
|
||||
'run_all_enqueued': 0,
|
||||
'active_schedule_ids': [],
|
||||
"created_schedule_ids": [],
|
||||
"disabled_count": 0,
|
||||
"run_all_enqueued": 0,
|
||||
"active_schedule_ids": [],
|
||||
}
|
||||
|
||||
def _active_schedules():
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')
|
||||
return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")
|
||||
|
||||
if clear:
|
||||
disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
|
||||
is_enabled=False,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
result['disabled_count'] = disabled_count
|
||||
print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')
|
||||
result["disabled_count"] = disabled_count
|
||||
print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")
|
||||
|
||||
if every or add:
|
||||
schedule_str = (every or 'day').strip()
|
||||
schedule_str = (every or "day").strip()
|
||||
validate_schedule(schedule_str)
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
is_update_schedule = not import_path
|
||||
template_urls = import_path or 'archivebox://update'
|
||||
template_label = (
|
||||
f'Scheduled import: {template_urls}'
|
||||
if import_path else
|
||||
'Scheduled ArchiveBox update'
|
||||
)[:64]
|
||||
template_urls = import_path or "archivebox://update"
|
||||
template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
|
||||
template_notes = (
|
||||
f'Created by archivebox schedule for {template_urls}'
|
||||
if import_path else
|
||||
'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
|
||||
f"Created by archivebox schedule for {template_urls}"
|
||||
if import_path
|
||||
else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
|
||||
)
|
||||
|
||||
template = Crawl.objects.create(
|
||||
urls=template_urls,
|
||||
max_depth=0 if is_update_schedule else depth,
|
||||
tags_str='' if is_update_schedule else tag,
|
||||
tags_str="" if is_update_schedule else tag,
|
||||
label=template_label,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
config={
|
||||
'ONLY_NEW': not update,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': 0 if is_update_schedule else depth,
|
||||
'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
|
||||
"ONLY_NEW": not update,
|
||||
"OVERWRITE": overwrite,
|
||||
"DEPTH": 0 if is_update_schedule else depth,
|
||||
"SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
|
||||
},
|
||||
)
|
||||
crawl_schedule = CrawlSchedule.objects.create(
|
||||
@@ -92,31 +90,31 @@ def schedule(add: bool = False,
|
||||
notes=template_notes,
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
result['created_schedule_ids'] = [str(crawl_schedule.id)]
|
||||
result["created_schedule_ids"] = [str(crawl_schedule.id)]
|
||||
|
||||
schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
|
||||
print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
|
||||
print(f' id={crawl_schedule.id}')
|
||||
print(f' every={crawl_schedule.schedule}')
|
||||
print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
|
||||
schedule_type = "maintenance update" if is_update_schedule else "crawl"
|
||||
print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
|
||||
print(f" id={crawl_schedule.id}")
|
||||
print(f" every={crawl_schedule.schedule}")
|
||||
print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
|
||||
if import_path:
|
||||
print(f' source={import_path}')
|
||||
print(f" source={import_path}")
|
||||
|
||||
schedules = list(_active_schedules())
|
||||
result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules]
|
||||
result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules]
|
||||
|
||||
if show:
|
||||
if schedules:
|
||||
print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
|
||||
print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
template = scheduled_crawl.template
|
||||
print(
|
||||
f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
|
||||
f'next_run={scheduled_crawl.next_run_at.isoformat()} '
|
||||
f'source={template.urls.splitlines()[0] if template.urls else ""}'
|
||||
f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
|
||||
f"next_run={scheduled_crawl.next_run_at.isoformat()} "
|
||||
f"source={template.urls.splitlines()[0] if template.urls else ''}",
|
||||
)
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
if run_all:
|
||||
enqueued = 0
|
||||
@@ -124,13 +122,17 @@ def schedule(add: bool = False,
|
||||
for scheduled_crawl in schedules:
|
||||
scheduled_crawl.enqueue(queued_at=now)
|
||||
enqueued += 1
|
||||
result['run_all_enqueued'] = enqueued
|
||||
print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
|
||||
result["run_all_enqueued"] = enqueued
|
||||
print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
|
||||
if enqueued:
|
||||
print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')
|
||||
print(
|
||||
"[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
|
||||
)
|
||||
|
||||
if foreground:
|
||||
print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
|
||||
print(
|
||||
"[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
|
||||
)
|
||||
run_pending_crawls(daemon=True)
|
||||
|
||||
if quiet:
|
||||
@@ -138,33 +140,38 @@ def schedule(add: bool = False,
|
||||
|
||||
if not any((every, add, show, clear, foreground, run_all)):
|
||||
if schedules:
|
||||
print('[green]\\[*] Active scheduled crawls:[/green]')
|
||||
print("[green]\\[*] Active scheduled crawls:[/green]")
|
||||
for scheduled_crawl in schedules:
|
||||
print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
|
||||
print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
|
||||
else:
|
||||
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
|
||||
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
|
||||
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
|
||||
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
|
||||
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
|
||||
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
|
||||
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
|
||||
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
|
||||
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
|
||||
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
|
||||
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
|
||||
@click.argument('import_path', required=False)
|
||||
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
|
||||
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
|
||||
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
|
||||
@click.option(
|
||||
"--depth",
|
||||
type=click.Choice([str(i) for i in range(5)]),
|
||||
default="0",
|
||||
help="Recursively archive linked pages up to N hops away",
|
||||
)
|
||||
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
|
||||
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
|
||||
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
|
||||
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
|
||||
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
|
||||
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
|
||||
@click.argument("import_path", required=False)
|
||||
@docstring(schedule.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Manage database-backed scheduled crawls processed by the crawl runner."""
|
||||
schedule(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox search'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox search"
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Callable
|
||||
from typing import TYPE_CHECKING
|
||||
from collections.abc import Callable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -20,30 +21,28 @@ if TYPE_CHECKING:
|
||||
|
||||
# Filter types for URL matching
|
||||
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
|
||||
'exact': lambda pattern: Q(url=pattern),
|
||||
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||
'domain': lambda pattern: (
|
||||
Q(url__istartswith=f'http://{pattern}')
|
||||
| Q(url__istartswith=f'https://{pattern}')
|
||||
| Q(url__istartswith=f'ftp://{pattern}')
|
||||
"exact": lambda pattern: Q(url=pattern),
|
||||
"substring": lambda pattern: Q(url__icontains=pattern),
|
||||
"regex": lambda pattern: Q(url__iregex=pattern),
|
||||
"domain": lambda pattern: (
|
||||
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
|
||||
),
|
||||
'tag': lambda pattern: Q(tags__name=pattern),
|
||||
'timestamp': lambda pattern: Q(timestamp=pattern),
|
||||
"tag": lambda pattern: Q(tags__name=pattern),
|
||||
"timestamp": lambda pattern: Q(timestamp=pattern),
|
||||
}
|
||||
|
||||
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
|
||||
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
stderr()
|
||||
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
|
||||
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
query = Q()
|
||||
@@ -53,7 +52,7 @@ def _apply_pattern_filters(
|
||||
|
||||
|
||||
def _snapshots_to_json(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -63,31 +62,35 @@ def _snapshots_to_json(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.util import to_json
|
||||
|
||||
main_index_header = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.index.json',
|
||||
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
|
||||
'meta': {
|
||||
'project': 'ArchiveBox',
|
||||
'version': VERSION,
|
||||
'git_sha': VERSION,
|
||||
'website': 'https://ArchiveBox.io',
|
||||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': {},
|
||||
},
|
||||
} if with_headers else {}
|
||||
main_index_header = (
|
||||
{
|
||||
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
|
||||
"schema": "archivebox.index.json",
|
||||
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
|
||||
"meta": {
|
||||
"project": "ArchiveBox",
|
||||
"version": VERSION,
|
||||
"git_sha": VERSION,
|
||||
"website": "https://ArchiveBox.io",
|
||||
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
|
||||
"source": "https://github.com/ArchiveBox/ArchiveBox",
|
||||
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
|
||||
"dependencies": {},
|
||||
},
|
||||
}
|
||||
if with_headers
|
||||
else {}
|
||||
)
|
||||
|
||||
snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
output: dict[str, object] | list[dict[str, object]]
|
||||
if with_headers:
|
||||
output = {
|
||||
**main_index_header,
|
||||
'num_links': len(snapshot_dicts),
|
||||
'updated': datetime.now(tz.utc),
|
||||
'last_run_cmd': sys.argv,
|
||||
'links': snapshot_dicts,
|
||||
"num_links": len(snapshot_dicts),
|
||||
"updated": datetime.now(tz.utc),
|
||||
"last_run_cmd": sys.argv,
|
||||
"links": snapshot_dicts,
|
||||
}
|
||||
else:
|
||||
output = snapshot_dicts
|
||||
@@ -96,18 +99,18 @@ def _snapshots_to_json(
|
||||
|
||||
|
||||
def _snapshots_to_csv(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
cols: list[str],
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
header = ','.join(cols) if with_headers else ''
|
||||
rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
return '\n'.join((header, *rows))
|
||||
header = ",".join(cols) if with_headers else ""
|
||||
rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)]
|
||||
return "\n".join((header, *rows))
|
||||
|
||||
|
||||
def _snapshots_to_html(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
*,
|
||||
with_headers: bool,
|
||||
) -> str:
|
||||
@@ -119,26 +122,31 @@ def _snapshots_to_html(
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
|
||||
template = 'static_index.html' if with_headers else 'minimal_index.html'
|
||||
template = "static_index.html" if with_headers else "minimal_index.html"
|
||||
snapshot_list = list(snapshots.iterator(chunk_size=500))
|
||||
|
||||
return render_to_string(template, {
|
||||
'version': VERSION,
|
||||
'git_sha': get_COMMIT_HASH() or VERSION,
|
||||
'num_links': str(len(snapshot_list)),
|
||||
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
|
||||
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
|
||||
'links': snapshot_list,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
})
|
||||
return render_to_string(
|
||||
template,
|
||||
{
|
||||
"version": VERSION,
|
||||
"git_sha": get_COMMIT_HASH() or VERSION,
|
||||
"num_links": str(len(snapshot_list)),
|
||||
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
|
||||
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
|
||||
"links": snapshot_list,
|
||||
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
after: float | None=None,
|
||||
before: float | None=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
def get_snapshots(
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
after: float | None = None,
|
||||
before: float | None = None,
|
||||
out_dir: Path = DATA_DIR,
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
@@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
|
||||
result = _apply_pattern_filters(result, filter_patterns, filter_type)
|
||||
|
||||
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
|
||||
result = result.select_related('crawl', 'crawl__created_by')
|
||||
result = result.select_related("crawl", "crawl__created_by")
|
||||
|
||||
if not result.exists():
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@enforce_types
|
||||
def search(filter_patterns: list[str] | None=None,
|
||||
filter_type: str='substring',
|
||||
status: str='indexed',
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
sort: str | None=None,
|
||||
json: bool=False,
|
||||
html: bool=False,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
def search(
|
||||
filter_patterns: list[str] | None = None,
|
||||
filter_type: str = "substring",
|
||||
status: str = "indexed",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
sort: str | None = None,
|
||||
json: bool = False,
|
||||
html: bool = False,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
|
||||
raise SystemExit(2)
|
||||
|
||||
# Query DB directly - no filesystem scanning
|
||||
@@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None,
|
||||
)
|
||||
|
||||
# Apply status filter
|
||||
if status == 'archived':
|
||||
if status == "archived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||
elif status == 'unarchived':
|
||||
elif status == "unarchived":
|
||||
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||
# 'indexed' = all snapshots (no filter)
|
||||
|
||||
@@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None,
|
||||
elif html:
|
||||
output = _snapshots_to_html(snapshots, with_headers=with_headers)
|
||||
elif csv:
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
|
||||
output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
|
||||
else:
|
||||
from archivebox.misc.logging_util import printable_folders
|
||||
|
||||
# Convert to dict for printable_folders
|
||||
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
|
||||
output = printable_folders(folders, with_headers)
|
||||
@@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None,
|
||||
# Structured exports must be written directly to stdout.
|
||||
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
return output
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||
@click.help_option('--help', '-h')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option(
|
||||
"--filter-type",
|
||||
"-f",
|
||||
type=click.Choice(["search", *LINK_FILTERS.keys()]),
|
||||
default="substring",
|
||||
help="Pattern matching type for filtering URLs",
|
||||
)
|
||||
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
|
||||
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
|
||||
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
|
||||
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
|
||||
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
|
||||
@click.help_option("--help", "-h")
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(search.__doc__)
|
||||
def main(**kwargs):
|
||||
return search(**kwargs)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
@@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG
|
||||
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
|
||||
"""Stop any existing orchestrator process so the server can take ownership."""
|
||||
process_model.cleanup_stale_running(machine=machine)
|
||||
process_model.cleanup_orphaned_workers()
|
||||
|
||||
running_runners = list(process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by('created_at'))
|
||||
running_runners = list(
|
||||
process_model.objects.filter(
|
||||
machine=machine,
|
||||
status=process_model.StatusChoices.RUNNING,
|
||||
process_type=process_model.TypeChoices.ORCHESTRATOR,
|
||||
).order_by("created_at"),
|
||||
)
|
||||
|
||||
if not running_runners:
|
||||
return 0
|
||||
|
||||
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
|
||||
log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]")
|
||||
|
||||
if supervisor is not None and stop_worker_fn is not None:
|
||||
for worker_name in ('worker_runner', 'worker_runner_watch'):
|
||||
for worker_name in ("worker_runner", "worker_runner_watch"):
|
||||
try:
|
||||
stop_worker_fn(supervisor, worker_name)
|
||||
except Exception:
|
||||
@@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None,
|
||||
return len(running_runners)
|
||||
|
||||
|
||||
def _read_supervisor_worker_command(worker_name: str) -> str:
    """Look up the ``command=`` entry in a supervisord worker's config file.

    The config is expected at ``<sock dir>/<WORKERS_DIR_NAME>/<worker_name>.conf``.

    Args:
        worker_name: Supervisord program name, used as the config file stem.

    Returns:
        The text after ``command=`` on the first line that starts with that
        prefix, stripped of surrounding whitespace. An empty string is
        returned when the file is missing or has no such line.
    """
    from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file

    conf_path = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf"
    if not conf_path.exists():
        return ""

    prefix = "command="
    # Lazily scan the file; only the first matching line is ever consumed.
    matches = (
        conf_line.removeprefix(prefix).strip()
        for conf_line in conf_path.read_text().splitlines()
        if conf_line.startswith(prefix)
    )
    return next(matches, "")
|
||||
|
||||
|
||||
def _worker_command_matches_bind(command: str, host: str, port: str) -> bool:
|
||||
if not command:
|
||||
return False
|
||||
return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command)
|
||||
|
||||
|
||||
def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int:
    """Stop supervisord web workers that already serve the requested host/port.

    Checks ``worker_runserver`` and ``worker_daphne`` in that order. A worker
    is stopped only if supervisord reports it as ``RUNNING`` and its configured
    command matches the requested bind. The takeover notice is logged once,
    just before the first worker is stopped.

    Args:
        supervisor: Supervisord RPC handle exposing ``getProcessInfo``; a falsy
            value means no worker is inspected.
        stop_worker_fn: Callable invoked as ``stop_worker_fn(supervisor, name)``.
        host: Requested bind host.
        port: Requested bind port, as a string.
        log: Callable used to emit the takeover notice.

    Returns:
        The number of workers that were stopped.
    """

    def _owns_requested_bind(name: str) -> bool:
        # Any failure while querying supervisord is treated as "not running".
        try:
            info = supervisor.getProcessInfo(name) if supervisor else None
        except Exception:
            info = None
        if not (isinstance(info, dict) and info.get("statename") == "RUNNING"):
            return False
        configured_command = _read_supervisor_worker_command(name)
        return _worker_command_matches_bind(configured_command, host, port)

    taken_over: list[str] = []
    for candidate in ("worker_runserver", "worker_daphne"):
        if not _owns_requested_bind(candidate):
            continue
        if not taken_over:
            log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]")
        stop_worker_fn(supervisor, candidate)
        taken_over.append(candidate)

    return len(taken_over)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool=False,
|
||||
init: bool=False,
|
||||
debug: bool=False,
|
||||
daemonize: bool=False,
|
||||
nothreading: bool=False) -> None:
|
||||
def server(
|
||||
runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,),
|
||||
reload: bool = False,
|
||||
init: bool = False,
|
||||
debug: bool = False,
|
||||
daemonize: bool = False,
|
||||
nothreading: bool = False,
|
||||
) -> None:
|
||||
"""Run the ArchiveBox HTTP server"""
|
||||
|
||||
runserver_args = list(runserver_args)
|
||||
|
||||
|
||||
if init:
|
||||
from archivebox.cli.archivebox_init import init as archivebox_init
|
||||
|
||||
archivebox_init(quick=True)
|
||||
print()
|
||||
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
|
||||
check_data_folder()
|
||||
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
@@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
SHELL_CONFIG.DEBUG = True
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
|
||||
|
||||
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
|
||||
print()
|
||||
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(
|
||||
"[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:",
|
||||
)
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
print()
|
||||
|
||||
host = '127.0.0.1'
|
||||
port = '8000'
|
||||
|
||||
host = "127.0.0.1"
|
||||
port = "8000"
|
||||
|
||||
try:
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
|
||||
if ':' in host_and_port:
|
||||
host, port = host_and_port.split(':')
|
||||
host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0]
|
||||
if ":" in host_and_port:
|
||||
host, port = host_and_port.split(":")
|
||||
else:
|
||||
if '.' in host_and_port:
|
||||
if "." in host_and_port:
|
||||
host = host_and_port
|
||||
else:
|
||||
port = host_and_port
|
||||
@@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
)
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f'[red][X] Error: Port {port} is already in use[/red]')
|
||||
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
|
||||
print(' Stop the conflicting process or choose a different port')
|
||||
sys.exit(1)
|
||||
|
||||
machine = Machine.current()
|
||||
supervisor = get_existing_supervisord_process()
|
||||
stop_existing_background_runner(
|
||||
machine=machine,
|
||||
process_model=Process,
|
||||
supervisor=get_existing_supervisord_process(),
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
)
|
||||
if supervisor:
|
||||
stop_existing_server_workers(
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
host=host,
|
||||
port=port,
|
||||
)
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
print(f"[red][X] Error: Port {port} is already in use[/red]")
|
||||
print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}")
|
||||
print(" Stop the conflicting process or choose a different port")
|
||||
sys.exit(1)
|
||||
|
||||
supervisor = get_existing_supervisord_process()
|
||||
if supervisor:
|
||||
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
|
||||
server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne"
|
||||
server_proc = get_worker(supervisor, server_worker_name)
|
||||
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
|
||||
if server_state == 'RUNNING':
|
||||
runner_proc = get_worker(supervisor, 'worker_runner')
|
||||
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
|
||||
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
|
||||
print('[red][X] Error: ArchiveBox server is already running[/red]')
|
||||
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
if runner_state == 'RUNNING':
|
||||
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
|
||||
if runner_watch_state == 'RUNNING':
|
||||
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
|
||||
server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None
|
||||
if server_state == "RUNNING":
|
||||
runner_proc = get_worker(supervisor, "worker_runner")
|
||||
runner_watch_proc = get_worker(supervisor, "worker_runner_watch")
|
||||
runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None
|
||||
runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None
|
||||
print("[red][X] Error: ArchiveBox server is already running[/red]")
|
||||
print(
|
||||
f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
if runner_state == "RUNNING":
|
||||
print(" [green]√[/green] Background runner (worker_runner) is RUNNING")
|
||||
if runner_watch_state == "RUNNING":
|
||||
print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING")
|
||||
print()
|
||||
print('[yellow]To stop the existing server, run:[/yellow]')
|
||||
print("[yellow]To stop the existing server, run:[/yellow]")
|
||||
print(' pkill -f "archivebox server"')
|
||||
print(' pkill -f supervisord')
|
||||
print(" pkill -f supervisord")
|
||||
sys.exit(1)
|
||||
|
||||
if run_in_debug:
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]")
|
||||
else:
|
||||
print('[green][+] Starting ArchiveBox webserver...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
|
||||
print(' > Writing ArchiveBox error log to ./logs/errors.log')
|
||||
print("[green][+] Starting ArchiveBox webserver...[/green]")
|
||||
print(
|
||||
f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
|
||||
)
|
||||
print(
|
||||
f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]",
|
||||
)
|
||||
print(" > Writing ArchiveBox error log to ./logs/errors.log")
|
||||
print()
|
||||
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
|
||||
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('runserver_args', nargs=-1)
|
||||
@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change')
|
||||
@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors')
|
||||
@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode')
|
||||
@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server')
|
||||
@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon')
|
||||
@click.argument("runserver_args", nargs=-1)
|
||||
@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change")
|
||||
@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors")
|
||||
@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode")
|
||||
@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server")
|
||||
@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon")
|
||||
@docstring(server.__doc__)
|
||||
def main(**kwargs):
|
||||
server(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,27 +1,28 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def shell(args: Iterable[str]=()) -> None:
|
||||
def shell(args: Iterable[str] = ()) -> None:
|
||||
"""Enter an interactive ArchiveBox Django shell"""
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
call_command("shell_plus", *args)
|
||||
|
||||
|
||||
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
|
||||
@click.argument('args', nargs=-1)
|
||||
@click.argument("args", nargs=-1)
|
||||
@docstring(shell.__doc__)
|
||||
def main(args: Iterable[str]=()) -> None:
|
||||
def main(args: Iterable[str] = ()) -> None:
|
||||
shell(args=args)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -27,14 +27,16 @@ Examples:
|
||||
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
from django.db.models import Q, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
@@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_snapshots(
|
||||
urls: Iterable[str],
|
||||
tag: str = '',
|
||||
status: str = 'queued',
|
||||
tag: str = "",
|
||||
status: str = "queued",
|
||||
depth: int = 0,
|
||||
created_by_id: Optional[int] = None,
|
||||
created_by_id: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||
@@ -59,8 +62,10 @@ def create_snapshots(
|
||||
1: Failure
|
||||
"""
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_CRAWL
|
||||
read_args_or_stdin,
|
||||
write_record,
|
||||
TYPE_SNAPSHOT,
|
||||
TYPE_CRAWL,
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -73,7 +78,7 @@ def create_snapshots(
|
||||
records = list(read_args_or_stdin(urls))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Process each record - handle Crawls and plain URLs/Snapshots
|
||||
@@ -81,7 +86,7 @@ def create_snapshots(
|
||||
pass_through_count = 0
|
||||
|
||||
for record in records:
|
||||
record_type = record.get('type', '')
|
||||
record_type = record.get("type", "")
|
||||
|
||||
try:
|
||||
if record_type == TYPE_CRAWL:
|
||||
@@ -91,14 +96,14 @@ def create_snapshots(
|
||||
|
||||
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
||||
crawl = None
|
||||
crawl_id = record.get('id')
|
||||
crawl_id = record.get("id")
|
||||
if crawl_id:
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
except Crawl.DoesNotExist:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
else:
|
||||
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
|
||||
if not crawl:
|
||||
continue
|
||||
@@ -109,27 +114,27 @@ def create_snapshots(
|
||||
if tag:
|
||||
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': depth,
|
||||
'status': status,
|
||||
"url": url,
|
||||
"tags": merged_tags,
|
||||
"crawl_id": str(crawl.id),
|
||||
"depth": depth,
|
||||
"status": status,
|
||||
}
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or record.get('url'):
|
||||
elif record_type == TYPE_SNAPSHOT or record.get("url"):
|
||||
# Input is a Snapshot or plain URL
|
||||
if tag and not record.get('tags'):
|
||||
record['tags'] = tag
|
||||
if tag and not record.get("tags"):
|
||||
record["tags"] = tag
|
||||
if status:
|
||||
record['status'] = status
|
||||
record['depth'] = record.get('depth', depth)
|
||||
record["status"] = status
|
||||
record["depth"] = record.get("depth", depth)
|
||||
|
||||
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
if not is_tty:
|
||||
@@ -142,21 +147,21 @@ def create_snapshots(
|
||||
pass_through_count += 1
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
if not created_snapshots:
|
||||
if pass_through_count > 0:
|
||||
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||
rprint("[red]No snapshots created[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
|
||||
|
||||
if is_tty:
|
||||
for snapshot in created_snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
|
||||
return 0
|
||||
|
||||
@@ -165,16 +170,19 @@ def create_snapshots(
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_snapshots(
|
||||
status: Optional[str] = None,
|
||||
url__icontains: Optional[str] = None,
|
||||
url__istartswith: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
crawl_id: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
sort: Optional[str] = None,
|
||||
csv: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
url__icontains: str | None = None,
|
||||
url__istartswith: str | None = None,
|
||||
tag: str | None = None,
|
||||
crawl_id: str | None = None,
|
||||
limit: int | None = None,
|
||||
sort: str | None = None,
|
||||
csv: str | None = None,
|
||||
with_headers: bool = False,
|
||||
search: str | None = None,
|
||||
query: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Snapshots as JSONL with optional filters.
|
||||
@@ -184,64 +192,106 @@ def list_snapshots(
|
||||
"""
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.search import (
|
||||
get_default_search_mode,
|
||||
get_search_mode,
|
||||
prioritize_metadata_matches,
|
||||
query_search_index,
|
||||
)
|
||||
|
||||
if with_headers and not csv:
|
||||
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
|
||||
rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
is_tty = sys.stdout.isatty() and not csv
|
||||
|
||||
queryset = Snapshot.objects.all().order_by('-created_at')
|
||||
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'status': status,
|
||||
'url__icontains': url__icontains,
|
||||
'url__istartswith': url__istartswith,
|
||||
'crawl_id': crawl_id,
|
||||
"status": status,
|
||||
"url__icontains": url__icontains,
|
||||
"url__istartswith": url__istartswith,
|
||||
"crawl_id": crawl_id,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
queryset = apply_filters(queryset, filter_kwargs)
|
||||
|
||||
# Tag filter requires special handling (M2M)
|
||||
if tag:
|
||||
queryset = queryset.filter(tags__name__iexact=tag)
|
||||
|
||||
query = (query or "").strip()
|
||||
if query:
|
||||
metadata_qs = queryset.filter(
|
||||
Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
|
||||
)
|
||||
requested_search_mode = (search or "").strip().lower()
|
||||
if requested_search_mode == "content":
|
||||
requested_search_mode = "contents"
|
||||
search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
|
||||
|
||||
if search_mode == "meta":
|
||||
queryset = metadata_qs
|
||||
else:
|
||||
try:
|
||||
deep_qsearch = None
|
||||
if search_mode == "deep":
|
||||
qsearch = query_search_index(query, search_mode="contents")
|
||||
deep_qsearch = query_search_index(query, search_mode="deep")
|
||||
else:
|
||||
qsearch = query_search_index(query, search_mode=search_mode)
|
||||
queryset = prioritize_metadata_matches(
|
||||
queryset,
|
||||
metadata_qs,
|
||||
qsearch,
|
||||
deep_queryset=deep_qsearch,
|
||||
ordering=("-created_at",) if not sort else None,
|
||||
)
|
||||
except Exception as err:
|
||||
rprint(
|
||||
f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
|
||||
file=sys.stderr,
|
||||
)
|
||||
queryset = metadata_qs
|
||||
|
||||
if sort:
|
||||
queryset = queryset.order_by(sort)
|
||||
if limit:
|
||||
queryset = queryset[:limit]
|
||||
|
||||
count = 0
|
||||
if csv:
|
||||
cols = [col.strip() for col in csv.split(',') if col.strip()]
|
||||
cols = [col.strip() for col in csv.split(",") if col.strip()]
|
||||
if not cols:
|
||||
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
|
||||
rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
|
||||
return 2
|
||||
rows: list[str] = []
|
||||
if with_headers:
|
||||
rows.append(','.join(cols))
|
||||
rows.append(",".join(cols))
|
||||
for snapshot in queryset.iterator(chunk_size=500):
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=','))
|
||||
rows.append(snapshot.to_csv(cols=cols, separator=","))
|
||||
count += 1
|
||||
output = '\n'.join(rows)
|
||||
output = "\n".join(rows)
|
||||
if output:
|
||||
sys.stdout.write(output)
|
||||
if not output.endswith('\n'):
|
||||
sys.stdout.write('\n')
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
if not output.endswith("\n"):
|
||||
sys.stdout.write("\n")
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
for snapshot in queryset:
|
||||
if is_tty:
|
||||
status_color = {
|
||||
'queued': 'yellow',
|
||||
'started': 'blue',
|
||||
'sealed': 'green',
|
||||
}.get(snapshot.status, 'dim')
|
||||
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
|
||||
"queued": "yellow",
|
||||
"started": "blue",
|
||||
"sealed": "green",
|
||||
}.get(snapshot.status, "dim")
|
||||
rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
|
||||
else:
|
||||
write_record(snapshot.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -249,9 +299,10 @@ def list_snapshots(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def update_snapshots(
|
||||
status: Optional[str] = None,
|
||||
tag: Optional[str] = None,
|
||||
status: str | None = None,
|
||||
tag: str | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
Update Snapshots from stdin JSONL.
|
||||
@@ -272,12 +323,12 @@ def update_snapshots(
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
snapshot_id = record.get('id')
|
||||
snapshot_id = record.get("id")
|
||||
if not snapshot_id:
|
||||
continue
|
||||
|
||||
@@ -292,6 +343,7 @@ def update_snapshots(
|
||||
# Add tag to existing tags
|
||||
snapshot.save() # Ensure saved before M2M
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
tag_obj, _ = Tag.objects.get_or_create(name=tag)
|
||||
snapshot.tags.add(tag_obj)
|
||||
|
||||
@@ -302,10 +354,10 @@ def update_snapshots(
|
||||
write_record(snapshot.to_json())
|
||||
|
||||
except Snapshot.DoesNotExist:
|
||||
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -313,6 +365,7 @@ def update_snapshots(
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Snapshots from stdin JSONL.
|
||||
@@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshot_ids = [r.get('id') for r in records if r.get('id')]
|
||||
snapshot_ids = [r.get("id") for r in records if r.get("id")]
|
||||
|
||||
if not snapshot_ids:
|
||||
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
|
||||
count = snapshots.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
|
||||
for snapshot in snapshots:
|
||||
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
|
||||
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = snapshots.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Manage Snapshot records."""
|
||||
pass
|
||||
|
||||
|
||||
@main.command('create')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@main.command("create")
|
||||
@click.argument("urls", nargs=-1)
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
|
||||
"""Create Snapshots from URLs or stdin JSONL."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
|
||||
@click.option('--url__icontains', help='Filter by URL contains')
|
||||
@click.option('--url__istartswith', help='Filter by URL starts with')
|
||||
@click.option('--tag', '-t', help='Filter by tag name')
|
||||
@click.option('--crawl-id', help='Filter by crawl ID')
|
||||
@click.option('--limit', '-n', type=int, help='Limit number of results')
|
||||
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
|
||||
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
|
||||
@main.command("list")
|
||||
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
|
||||
@click.option("--url__icontains", help="Filter by URL contains")
|
||||
@click.option("--url__istartswith", help="Filter by URL starts with")
|
||||
@click.option("--tag", "-t", help="Filter by tag name")
|
||||
@click.option("--crawl-id", help="Filter by crawl ID")
|
||||
@click.option("--limit", "-n", type=int, help="Limit number of results")
|
||||
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
|
||||
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
|
||||
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
|
||||
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
|
||||
@click.argument("query", nargs=-1)
|
||||
def list_cmd(
|
||||
status: str | None,
|
||||
url__icontains: str | None,
|
||||
url__istartswith: str | None,
|
||||
tag: str | None,
|
||||
crawl_id: str | None,
|
||||
limit: int | None,
|
||||
sort: str | None,
|
||||
csv: str | None,
|
||||
with_headers: bool,
|
||||
search: str | None,
|
||||
query: tuple[str, ...],
|
||||
):
|
||||
"""List Snapshots as JSONL."""
|
||||
sys.exit(list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
))
|
||||
sys.exit(
|
||||
list_snapshots(
|
||||
status=status,
|
||||
url__icontains=url__icontains,
|
||||
url__istartswith=url__istartswith,
|
||||
tag=tag,
|
||||
crawl_id=crawl_id,
|
||||
limit=limit,
|
||||
sort=sort,
|
||||
csv=csv,
|
||||
with_headers=with_headers,
|
||||
search=search,
|
||||
query=" ".join(query),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@main.command('update')
|
||||
@click.option('--status', '-s', help='Set status')
|
||||
@click.option('--tag', '-t', help='Add tag')
|
||||
def update_cmd(status: Optional[str], tag: Optional[str]):
|
||||
@main.command("update")
|
||||
@click.option("--status", "-s", help="Set status")
|
||||
@click.option("--tag", "-t", help="Add tag")
|
||||
def update_cmd(status: str | None, tag: str | None):
|
||||
"""Update Snapshots from stdin JSONL."""
|
||||
sys.exit(update_snapshots(status=status, tag=tag))
|
||||
|
||||
|
||||
@main.command('delete')
|
||||
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
|
||||
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
|
||||
@main.command("delete")
|
||||
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
|
||||
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
|
||||
def delete_cmd(yes: bool, dry_run: bool):
|
||||
"""Delete Snapshots from stdin JSONL."""
|
||||
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox snapshot'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox snapshot"
|
||||
|
||||
import sys
|
||||
|
||||
@@ -10,15 +10,15 @@ import rich_click as click
|
||||
from archivebox.cli.archivebox_snapshot import create_snapshots
|
||||
|
||||
|
||||
@click.command(context_settings={'ignore_unknown_options': True})
|
||||
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
|
||||
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
|
||||
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
|
||||
@click.argument('urls', nargs=-1)
|
||||
@click.command(context_settings={"ignore_unknown_options": True})
|
||||
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
|
||||
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
|
||||
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
|
||||
@click.argument("urls", nargs=-1)
|
||||
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
|
||||
"""Backwards-compatible `archivebox snapshot URL...` entrypoint."""
|
||||
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
@@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
|
||||
@enforce_types
|
||||
def status(out_dir: Path=DATA_DIR) -> None:
|
||||
def status(out_dir: Path = DATA_DIR) -> None:
|
||||
"""Print out some info and statistics about the archive collection"""
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.db.models import Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
User = get_user_model()
|
||||
|
||||
print('[green]\\[*] Scanning archive main index...[/green]')
|
||||
print(f'[yellow] {out_dir}/*[/yellow]')
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
|
||||
print("[green]\\[*] Scanning archive main index...[/green]")
|
||||
print(f"[yellow] {out_dir}/*[/yellow]")
|
||||
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Index size: {size} across {num_files} files')
|
||||
print(f" Index size: {size} across {num_files} files")
|
||||
print()
|
||||
|
||||
links = list(Snapshot.objects.all())
|
||||
links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
|
||||
num_sql_links = len(links)
|
||||
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
|
||||
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
|
||||
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
|
||||
print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
|
||||
print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
|
||||
print()
|
||||
print('[green]\\[*] Scanning archive data directories...[/green]')
|
||||
users_dir = out_dir / 'users'
|
||||
print("[green]\\[*] Scanning archive data directories...[/green]")
|
||||
users_dir = out_dir / "users"
|
||||
scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
|
||||
scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f'[yellow] {scan_roots_display}[/yellow]')
|
||||
scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
|
||||
print(f"[yellow] {scan_roots_display}[/yellow]")
|
||||
num_bytes = num_dirs = num_files = 0
|
||||
for root in scan_roots:
|
||||
root_bytes, root_dirs, root_files = get_dir_size(root)
|
||||
@@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
num_dirs += root_dirs
|
||||
num_files += root_files
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
print(f" Size: {size} across {num_files} files in {num_dirs} directories")
|
||||
|
||||
# Use DB as source of truth for snapshot status
|
||||
num_indexed = len(links)
|
||||
num_archived = sum(1 for snapshot in links if snapshot.is_archived)
|
||||
num_unarchived = max(num_indexed - num_archived, 0)
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
|
||||
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
|
||||
print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
|
||||
print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
|
||||
print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")
|
||||
|
||||
# Count snapshot directories on filesystem across both legacy and current layouts.
|
||||
expected_snapshot_dirs = {
|
||||
str(Path(snapshot.output_dir).resolve())
|
||||
for snapshot in links
|
||||
if Path(snapshot.output_dir).exists()
|
||||
}
|
||||
expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()}
|
||||
discovered_snapshot_dirs = set()
|
||||
|
||||
if ARCHIVE_DIR.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in ARCHIVE_DIR.iterdir()
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())
|
||||
|
||||
if users_dir.exists():
|
||||
discovered_snapshot_dirs.update(
|
||||
str(entry.resolve())
|
||||
for entry in users_dir.glob('*/snapshots/*/*/*')
|
||||
if entry.is_dir()
|
||||
)
|
||||
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())
|
||||
|
||||
orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
|
||||
num_present = len(discovered_snapshot_dirs)
|
||||
num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)')
|
||||
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
|
||||
print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)")
|
||||
print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)")
|
||||
|
||||
num_orphaned = len(orphaned_dirs)
|
||||
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
|
||||
print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)")
|
||||
|
||||
if num_indexed:
|
||||
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
|
||||
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
|
||||
print(" [violet]Hint:[/violet] You can list snapshots by status like so:")
|
||||
print(" [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]")
|
||||
|
||||
if orphaned_dirs:
|
||||
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
|
||||
print(' [green]archivebox init[/green]')
|
||||
print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
|
||||
print(" [green]archivebox init[/green]")
|
||||
|
||||
print()
|
||||
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
|
||||
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username='system')
|
||||
print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
|
||||
print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]")
|
||||
admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
|
||||
users = [user.get_username() for user in admin_users]
|
||||
print(f' UI users {len(users)}: {", ".join(users)}')
|
||||
last_login = admin_users.order_by('last_login').last()
|
||||
print(f" UI users {len(users)}: {', '.join(users)}")
|
||||
last_login = admin_users.order_by("last_login").last()
|
||||
if last_login:
|
||||
print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}')
|
||||
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
|
||||
print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")
|
||||
last_downloaded = Snapshot.objects.order_by("downloaded_at").last()
|
||||
if last_downloaded:
|
||||
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
|
||||
print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}")
|
||||
|
||||
if not users:
|
||||
print()
|
||||
print(' [violet]Hint:[/violet] You can create an admin user by running:')
|
||||
print(' [green]archivebox manage createsuperuser[/green]')
|
||||
print(" [violet]Hint:[/violet] You can create an admin user by running:")
|
||||
print(" [green]archivebox manage createsuperuser[/green]")
|
||||
|
||||
print()
|
||||
recent_snapshots = sorted(
|
||||
links,
|
||||
key=lambda snapshot: (
|
||||
snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at
|
||||
),
|
||||
key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at,
|
||||
reverse=True,
|
||||
)[:10]
|
||||
for snapshot in recent_snapshots:
|
||||
@@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
continue
|
||||
print(
|
||||
(
|
||||
'[grey53] '
|
||||
f' > {str(snapshot.downloaded_at)[:16]} '
|
||||
f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
|
||||
"[grey53] "
|
||||
f" > {str(snapshot.downloaded_at)[:16]} "
|
||||
f"[{snapshot.num_outputs} {('X', '√')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] "
|
||||
f'"{snapshot.title}": {snapshot.url}'
|
||||
'[/grey53]'
|
||||
)[:SHELL_CONFIG.TERM_WIDTH],
|
||||
"[/grey53]"
|
||||
)[: SHELL_CONFIG.TERM_WIDTH],
|
||||
)
|
||||
print('[grey53] ...')
|
||||
print("[grey53] ...")
|
||||
|
||||
|
||||
@click.command()
|
||||
@@ -146,5 +135,5 @@ def main(**kwargs):
|
||||
status(**kwargs)
|
||||
|
||||
|
||||
# Allow running this module directly as a script; delegates to the click entry point.
if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -27,11 +27,11 @@ Examples:
|
||||
archivebox tag list --name=unused | archivebox tag delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox tag'
|
||||
__package__ = "archivebox.cli"
|
||||
__command__ = "archivebox tag"
|
||||
|
||||
import sys
|
||||
from typing import Optional, Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def create_tags(names: Iterable[str]) -> int:
|
||||
"""
|
||||
Create Tags from names.
|
||||
@@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
name_list = list(names) if names else []
|
||||
|
||||
if not name_list:
|
||||
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
created_count = 0
|
||||
@@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
|
||||
if created:
|
||||
created_count += 1
|
||||
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr)
|
||||
|
||||
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int:
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def list_tags(
|
||||
name: Optional[str] = None,
|
||||
name__icontains: Optional[str] = None,
|
||||
limit: Optional[int] = None,
|
||||
name: str | None = None,
|
||||
name__icontains: str | None = None,
|
||||
limit: int | None = None,
|
||||
) -> int:
|
||||
"""
|
||||
List Tags as JSONL with optional filters.
|
||||
@@ -104,12 +106,12 @@ def list_tags(
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
queryset = Tag.objects.all().order_by('name')
|
||||
queryset = Tag.objects.all().order_by("name")
|
||||
|
||||
# Apply filters
|
||||
filter_kwargs = {
|
||||
'name': name,
|
||||
'name__icontains': name__icontains,
|
||||
"name": name,
|
||||
"name__icontains": name__icontains,
|
||||
}
|
||||
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
|
||||
|
||||
@@ -117,12 +119,12 @@ def list_tags(
|
||||
for tag in queryset:
|
||||
snapshot_count = tag.snapshot_set.count()
|
||||
if is_tty:
|
||||
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
|
||||
rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]")
|
||||
else:
|
||||
write_record(tag.to_json())
|
||||
count += 1
|
||||
|
||||
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
|
||||
rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -130,7 +132,8 @@ def list_tags(
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
def update_tags(name: str | None = None) -> int:
|
||||
"""
|
||||
Update Tags from stdin JSONL.
|
||||
|
||||
@@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
updated_count = 0
|
||||
for record in records:
|
||||
tag_id = record.get('id')
|
||||
old_name = record.get('name')
|
||||
tag_id = record.get("id")
|
||||
old_name = record.get("name")
|
||||
|
||||
if not tag_id and not old_name:
|
||||
continue
|
||||
@@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
write_record(tag.to_json())
|
||||
|
||||
except Tag.DoesNotExist:
|
||||
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr)
|
||||
continue
|
||||
|
||||
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int:
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
"""
|
||||
Delete Tags from stdin JSONL.
|
||||
@@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
|
||||
records = list(read_stdin())
|
||||
if not records:
|
||||
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Collect tag IDs or names
|
||||
tag_ids = []
|
||||
tag_names = []
|
||||
for r in records:
|
||||
if r.get('id'):
|
||||
tag_ids.append(r['id'])
|
||||
elif r.get('name'):
|
||||
tag_names.append(r['name'])
|
||||
if r.get("id"):
|
||||
tag_ids.append(r["id"])
|
||||
elif r.get("name"):
|
||||
tag_names.append(r["name"])
|
||||
|
||||
if not tag_ids and not tag_names:
|
||||
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
query = Q()
|
||||
if tag_ids:
|
||||
query |= Q(id__in=tag_ids)
|
||||
@@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
count = tags.count()
|
||||
|
||||
if count == 0:
|
||||
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
|
||||
rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if dry_run:
|
||||
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
|
||||
rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr)
|
||||
for tag in tags:
|
||||
rprint(f' {tag.name}', file=sys.stderr)
|
||||
rprint(f" {tag.name}", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
if not yes:
|
||||
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
|
||||
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Perform deletion
|
||||
deleted_count, _ = tags.delete()
|
||||
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
|
||||
rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr)
|
||||
return 0
|
||||
|
||||
|
||||
@@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# Root click group for the tag subcommands registered below via @main.command(...).
# The docstring is kept verbatim because click uses it as the group's help text.
@click.group()
def main() -> None:
    """Manage Tag records."""
|
||||
|
||||
|
||||
@main.command("create")
@click.argument("names", nargs=-1)
def create_cmd(names: tuple[str, ...]) -> None:
    """Create Tags from names."""
    # nargs=-1 collects every positional argument into a tuple (possibly empty).
    # The process exit status is whatever create_tags() returns.
    sys.exit(create_tags(names))
|
||||
|
||||
|
||||
@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None) -> None:
    """List Tags as JSONL."""
    # Options that were not passed arrive as None and are forwarded unchanged.
    # The process exit status is whatever list_tags() returns.
    sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
|
||||
|
||||
|
||||
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None) -> None:
    """Update Tags from stdin JSONL."""
    # The records to update are read from stdin inside update_tags(); only the
    # new name comes from the command line. Exit status is update_tags()'s return value.
    sys.exit(update_tags(name=name))
|
||||
|
||||
|
||||
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool) -> None:
    """Delete Tags from stdin JSONL."""
    # Both flags default to False when omitted (is_flag=True).
    # Exit status is delete_tags()'s return value.
    sys.exit(delete_tags(yes=yes, dry_run=dry_run))
|
||||
|
||||
|
||||
# Allow running this module directly as a script; delegates to the click group.
if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import os
|
||||
import time
|
||||
|
||||
from typing import TYPE_CHECKING, Callable, Iterable
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from collections.abc import Callable, Iterable
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
@@ -20,24 +21,22 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
# Maps a filter_type name to a builder that turns one pattern string into a
# Django Q object over Snapshot fields. Looked up by _apply_pattern_filters().
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    # URL equals the pattern.
    "exact": lambda pattern: Q(url=pattern),
    # URL contains the pattern (case-insensitive lookup).
    "substring": lambda pattern: Q(url__icontains=pattern),
    # URL matches the pattern as a case-insensitive regular expression.
    "regex": lambda pattern: Q(url__iregex=pattern),
    # URL starts with http://, https:// or ftp:// followed by the pattern.
    # NOTE(review): this is a plain prefix match, so "example.com" also matches
    # "example.com.other.org" and "example.community" - confirm that is intended.
    "domain": lambda pattern: (
        Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
    ),
    # Snapshot has a tag whose name equals the pattern.
    "tag": lambda pattern: Q(tags__name=pattern),
    # Snapshot.timestamp equals the pattern.
    "timestamp": lambda pattern: Q(timestamp=pattern),
}
|
||||
|
||||
|
||||
def _apply_pattern_filters(
|
||||
snapshots: QuerySet['Snapshot', 'Snapshot'],
|
||||
snapshots: QuerySet["Snapshot", "Snapshot"],
|
||||
filter_patterns: list[str],
|
||||
filter_type: str,
|
||||
) -> QuerySet['Snapshot', 'Snapshot']:
|
||||
) -> QuerySet["Snapshot", "Snapshot"]:
|
||||
filter_builder = LINK_FILTERS.get(filter_type)
|
||||
if filter_builder is None:
|
||||
raise SystemExit(2)
|
||||
@@ -48,21 +47,120 @@ def _apply_pattern_filters(
|
||||
return snapshots.filter(query)
|
||||
|
||||
|
||||
def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
    """Return ``snapshot.crawl``, or ``None`` if accessing it raises ObjectDoesNotExist.

    Callers in this module use a ``None`` result to skip snapshots whose crawl
    reference cannot be resolved.
    """
    try:
        return snapshot.crawl
    except ObjectDoesNotExist:
        return None
|
||||
|
||||
|
||||
def _get_search_indexing_plugins() -> list[str]:
    """Return the sorted names of search-backend plugins that expose a snapshot index hook.

    A plugin is included only when all of the following hold:

    * its name starts with ``search_backend_``;
    * the rest of its name is one of the backends returned by
      ``archivebox.hooks.get_search_backends()``;
    * at least one of its hooks has a name containing ``Snapshot`` and,
      case-insensitively, ``index``.
    """
    from abx_dl.models import discover_plugins
    from archivebox.hooks import get_search_backends

    prefix = "search_backend_"
    available_backends = set(get_search_backends())
    plugins = discover_plugins()

    def _has_snapshot_index_hook(plugin) -> bool:
        # "Snapshot" is matched case-sensitively, "index" case-insensitively.
        return any("Snapshot" in hook.name and "index" in hook.name.lower() for hook in plugin.hooks)

    return sorted(
        plugin_name
        for plugin_name, plugin in plugins.items()
        if plugin_name.startswith(prefix)
        and plugin_name.removeprefix(prefix) in available_backends
        and _has_snapshot_index_hook(plugin)
    )
|
||||
|
||||
|
||||
def _build_filtered_snapshots_queryset(
    *,
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    resume: str | None = None,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Build the Snapshot queryset selected by the given filters.

    Args:
        filter_patterns: Patterns passed to ``_apply_pattern_filters``; no
            pattern filter is applied when this yields nothing.
        filter_type: Key into ``LINK_FILTERS`` naming how patterns are matched.
        before: POSIX timestamp; keep snapshots with ``bookmarked_at`` strictly
            earlier than it. Skipped when falsy (``None`` or ``0``).
        after: POSIX timestamp; keep snapshots with ``bookmarked_at`` strictly
            later than it. Skipped when falsy (``None`` or ``0``).
        resume: Keep only snapshots whose ``timestamp`` is ``<=`` this value.

    Returns:
        The filtered queryset with ``crawl`` joined via ``select_related`` and
        ordered by ``bookmarked_at`` descending.
    """
    from archivebox.core.models import Snapshot
    from datetime import datetime

    snapshots = Snapshot.objects.all()

    # Materialise once: a generator is always truthy even when it yields
    # nothing, so test the resulting list rather than the raw iterable.
    patterns = list(filter_patterns or ())
    if patterns:
        snapshots = _apply_pattern_filters(snapshots, patterns, filter_type)

    # NOTE(review): datetime.fromtimestamp() without tz returns a naive
    # local-time datetime - confirm this is what the bookmarked_at column expects.
    if before:
        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
    if after:
        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
    if resume:
        snapshots = snapshots.filter(timestamp__lte=resume)

    return snapshots.select_related("crawl").order_by("-bookmarked_at")
|
||||
|
||||
|
||||
def reindex_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    search_plugins: list[str],
    batch_size: int,
) -> dict[str, int]:
    """Queue and run the given search plugins for every snapshot in ``snapshots``.

    Args:
        snapshots: Queryset of snapshots to reindex.
        search_plugins: Plugin names to run for each snapshot.
        batch_size: ``chunk_size`` used when iterating the queryset.

    Returns:
        Counters: ``processed`` (snapshots seen, including skipped ones),
        ``reconciled`` (snapshots whose output directory existed and was
        reconciled), ``queued`` (snapshot/plugin records built) and
        ``reindexed`` (records handed to ``run_plugins`` once it succeeded).

    Raises:
        SystemExit: with the exit code of ``run_plugins`` when it is non-zero.
    """
    from archivebox.cli.archivebox_extract import run_plugins

    stats = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
    records: list[dict[str, str]] = []

    total = snapshots.count()
    print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")

    for snapshot in snapshots.iterator(chunk_size=batch_size):
        stats["processed"] += 1

        # Snapshots whose crawl cannot be resolved are counted but not reindexed.
        if _get_snapshot_crawl(snapshot) is None:
            continue

        # Path.is_dir() is already False for a missing path.
        if Path(snapshot.output_dir).is_dir():
            snapshot.reconcile_with_index_json()
            stats["reconciled"] += 1

        for plugin_name in search_plugins:
            # Reset the most recent existing result for this plugin, if any.
            existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
            if existing_result:
                existing_result.reset_for_retry()
            # NOTE(review): a record is queued for every snapshot/plugin pair,
            # whether or not a previous result existed - confirm this nesting.
            records.append(
                {
                    "type": "ArchiveResult",
                    "snapshot_id": str(snapshot.id),
                    "plugin": plugin_name,
                },
            )
            stats["queued"] += 1

    if not records:
        return stats

    # Run everything queued above in a single blocking call.
    exit_code = run_plugins(
        args=(),
        records=records,
        wait=True,
        emit_results=False,
    )
    if exit_code != 0:
        raise SystemExit(exit_code)

    stats["reindexed"] = len(records)
    return stats
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = 'exact',
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False) -> None:
|
||||
def update(
|
||||
filter_patterns: Iterable[str] = (),
|
||||
filter_type: str = "exact",
|
||||
before: float | None = None,
|
||||
after: float | None = None,
|
||||
resume: str | None = None,
|
||||
batch_size: int = 100,
|
||||
continuous: bool = False,
|
||||
index_only: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.
|
||||
|
||||
@@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
|
||||
from rich import print
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from django.core.management import call_command
|
||||
|
||||
# Run migrations first to ensure DB schema is up-to-date
|
||||
print('[*] Checking for pending migrations...')
|
||||
print("[*] Checking for pending migrations...")
|
||||
try:
|
||||
call_command('migrate', '--no-input', verbosity=0)
|
||||
call_command("migrate", "--no-input", verbosity=0)
|
||||
except Exception as e:
|
||||
print(f'[!] Warning: Migration check failed: {e}')
|
||||
print(f"[!] Warning: Migration check failed: {e}")
|
||||
|
||||
while True:
|
||||
if filter_patterns or before or after:
|
||||
if index_only:
|
||||
search_plugins = _get_search_indexing_plugins()
|
||||
if not search_plugins:
|
||||
print("[*] No search indexing plugins are available, nothing to backfill.")
|
||||
break
|
||||
|
||||
if not (filter_patterns or before or after):
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
stats = reindex_snapshots(
|
||||
snapshots,
|
||||
search_plugins=search_plugins,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_index_stats(stats)
|
||||
elif filter_patterns or before or after:
|
||||
# Filtered mode: query DB only
|
||||
print('[*] Processing filtered snapshots from database...')
|
||||
print("[*] Processing filtered snapshots from database...")
|
||||
stats = process_filtered_snapshots(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
batch_size=batch_size
|
||||
resume=resume,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
print_stats(stats)
|
||||
else:
|
||||
# Full mode: drain old dirs + process DB
|
||||
stats_combined = {'phase1': {}, 'phase2': {}}
|
||||
stats_combined = {"phase1": {}, "phase2": {}}
|
||||
|
||||
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
|
||||
stats_combined['phase1'] = drain_old_archive_dirs(
|
||||
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
|
||||
stats_combined["phase1"] = drain_old_archive_dirs(
|
||||
resume_from=resume,
|
||||
batch_size=batch_size
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
|
||||
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
|
||||
print("[*] Phase 2: Processing all database snapshots (most recent first)...")
|
||||
stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume)
|
||||
|
||||
# Phase 3: Deduplication (disabled for now)
|
||||
# print('[*] Phase 3: Deduplicating...')
|
||||
@@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
if not continuous:
|
||||
break
|
||||
|
||||
print('[yellow]Sleeping 60s before next pass...[/yellow]')
|
||||
print("[yellow]Sleeping 60s before next pass...[/yellow]")
|
||||
time.sleep(60)
|
||||
resume = None
|
||||
|
||||
@@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
from archivebox.config import CONSTANTS
|
||||
from django.db import transaction
|
||||
|
||||
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
|
||||
stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}
|
||||
|
||||
archive_dir = CONSTANTS.ARCHIVE_DIR
|
||||
if not archive_dir.exists():
|
||||
return stats
|
||||
|
||||
print('[DEBUG Phase1] Scanning for old directories in archive/...')
|
||||
print("[DEBUG Phase1] Scanning for old directories in archive/...")
|
||||
|
||||
# Scan for real directories only (skip symlinks - they're already migrated)
|
||||
all_entries = list(os.scandir(archive_dir))
|
||||
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
|
||||
print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
|
||||
entries = [
|
||||
(e.stat().st_mtime, e.path)
|
||||
for e in all_entries
|
||||
if e.is_dir(follow_symlinks=False) # Skip symlinks
|
||||
]
|
||||
entries.sort(reverse=True) # Newest first
|
||||
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
|
||||
print(f'[*] Found {len(entries)} old directories to drain')
|
||||
print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
|
||||
print(f"[*] Found {len(entries)} old directories to drain")
|
||||
|
||||
for mtime, entry_path in entries:
|
||||
entry_path = Path(entry_path)
|
||||
|
||||
# Resume from timestamp if specified
|
||||
if resume_from and entry_path.name < resume_from:
|
||||
if resume_from and entry_path.name > resume_from:
|
||||
continue
|
||||
|
||||
stats['processed'] += 1
|
||||
stats["processed"] += 1
|
||||
|
||||
# Try to load existing snapshot from DB
|
||||
snapshot = Snapshot.load_from_directory(entry_path)
|
||||
@@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not snapshot:
|
||||
# Invalid directory - move to invalid/
|
||||
Snapshot.move_directory_to_invalid(entry_path)
|
||||
stats['invalid'] += 1
|
||||
stats["invalid"] += 1
|
||||
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
|
||||
continue
|
||||
|
||||
try:
|
||||
snapshot.save()
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
continue
|
||||
|
||||
@@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if not has_valid_crawl:
|
||||
# Create a new crawl (created_by will default to system user)
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.create(urls=snapshot.url)
|
||||
# Use queryset update to avoid triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
|
||||
# Refresh the instance
|
||||
snapshot.crawl = crawl
|
||||
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
|
||||
|
||||
# Check if needs migration (0.8.x → 0.9.x)
|
||||
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
if snapshot.fs_migration_needed:
|
||||
try:
|
||||
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
|
||||
# because snapshot.timestamp might be truncated
|
||||
old_dir = entry_path
|
||||
new_dir = snapshot.get_storage_path_for_version('0.9.0')
|
||||
new_dir = snapshot.get_storage_path_for_version("0.9.0")
|
||||
print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")
|
||||
|
||||
# Manually migrate files
|
||||
if not new_dir.exists() and old_dir.exists():
|
||||
new_dir.mkdir(parents=True, exist_ok=True)
|
||||
import shutil
|
||||
|
||||
file_count = 0
|
||||
for old_file in old_dir.rglob('*'):
|
||||
for old_file in old_dir.rglob("*"):
|
||||
if old_file.is_file():
|
||||
rel_path = old_file.relative_to(old_dir)
|
||||
new_file = new_dir / rel_path
|
||||
@@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
|
||||
# Update only fs_version field using queryset update (bypasses validation)
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
|
||||
# Commit the transaction
|
||||
transaction.commit()
|
||||
@@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
|
||||
if old_dir.exists() and old_dir != new_dir:
|
||||
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
|
||||
|
||||
stats['migrated'] += 1
|
||||
stats["migrated"] += 1
|
||||
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
|
||||
except Exception as e:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
else:
|
||||
stats['skipped'] += 1
|
||||
stats["skipped"] += 1
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
|
||||
transaction.commit()
|
||||
return stats
|
||||
|
||||
|
||||
def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
|
||||
"""
|
||||
O(n) scan over entire DB from most recent to least recent.
|
||||
|
||||
@@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
total = Snapshot.objects.count()
|
||||
print(f'[*] Processing {total} snapshots from database (most recent first)...')
|
||||
queryset = Snapshot.objects.all()
|
||||
if resume:
|
||||
queryset = queryset.filter(timestamp__lte=resume)
|
||||
total = queryset.count()
|
||||
print(f"[*] Processing {total} snapshots from database (most recent first)...")
|
||||
|
||||
# Process from most recent to least recent
|
||||
for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references (orphaned by migration errors)
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
continue
|
||||
|
||||
try:
|
||||
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
|
||||
print(
|
||||
f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
|
||||
)
|
||||
|
||||
# Check if snapshot has a directory on disk
|
||||
from pathlib import Path
|
||||
|
||||
output_dir = Path(snapshot.output_dir)
|
||||
has_directory = output_dir.exists() and output_dir.is_dir()
|
||||
|
||||
@@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
|
||||
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
|
||||
# Use queryset update to set fs_version without triggering save() hooks
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
snapshot.fs_version = '0.9.0'
|
||||
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
|
||||
snapshot.fs_version = "0.9.0"
|
||||
|
||||
# Queue for archiving (state machine will handle it)
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1 if has_directory else 0
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1 if has_directory else 0
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed (e.g., missing crawl)
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -341,31 +480,28 @@ def process_filtered_snapshots(
|
||||
filter_type: str,
|
||||
before: float | None,
|
||||
after: float | None,
|
||||
batch_size: int
|
||||
resume: str | None,
|
||||
batch_size: int,
|
||||
) -> dict[str, int]:
|
||||
"""Process snapshots matching filters (DB query only)."""
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
from datetime import datetime
|
||||
|
||||
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
||||
stats = {"processed": 0, "reconciled": 0, "queued": 0}
|
||||
|
||||
snapshots = Snapshot.objects.all()
|
||||
|
||||
if filter_patterns:
|
||||
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
|
||||
|
||||
if before:
|
||||
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
||||
if after:
|
||||
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
||||
snapshots = _build_filtered_snapshots_queryset(
|
||||
filter_patterns=filter_patterns,
|
||||
filter_type=filter_type,
|
||||
before=before,
|
||||
after=after,
|
||||
resume=resume,
|
||||
)
|
||||
|
||||
total = snapshots.count()
|
||||
print(f'[*] Found {total} matching snapshots')
|
||||
print(f"[*] Found {total} matching snapshots")
|
||||
|
||||
for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size):
|
||||
stats['processed'] += 1
|
||||
for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
|
||||
stats["processed"] += 1
|
||||
|
||||
# Skip snapshots with missing crawl references
|
||||
if _get_snapshot_crawl(snapshot) is None:
|
||||
@@ -384,14 +520,14 @@ def process_filtered_snapshots(
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save()
|
||||
|
||||
stats['reconciled'] += 1
|
||||
stats['queued'] += 1
|
||||
stats["reconciled"] += 1
|
||||
stats["queued"] += 1
|
||||
except Exception as e:
|
||||
# Skip snapshots that can't be processed
|
||||
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
|
||||
continue
|
||||
|
||||
if stats['processed'] % batch_size == 0:
|
||||
if stats["processed"] % batch_size == 0:
|
||||
transaction.commit()
|
||||
print(f" [{stats['processed']}/{total}] Processed...")
|
||||
|
||||
@@ -405,9 +541,9 @@ def print_stats(stats: dict):
|
||||
|
||||
print(f"""
|
||||
[green]Update Complete[/green]
|
||||
Processed: {stats['processed']}
|
||||
Reconciled: {stats['reconciled']}
|
||||
Queued: {stats['queued']}
|
||||
Processed: {stats["processed"]}
|
||||
Reconciled: {stats["reconciled"]}
|
||||
Queued: {stats["queued"]}
|
||||
""")
|
||||
|
||||
|
||||
@@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict):
|
||||
"""Print statistics for full mode."""
|
||||
from rich import print
|
||||
|
||||
s1 = stats_combined['phase1']
|
||||
s2 = stats_combined['phase2']
|
||||
s1 = stats_combined["phase1"]
|
||||
s2 = stats_combined["phase2"]
|
||||
|
||||
print(f"""
|
||||
[green]Archive Update Complete[/green]
|
||||
|
||||
Phase 1 (Drain Old Dirs):
|
||||
Checked: {s1.get('processed', 0)}
|
||||
Migrated: {s1.get('migrated', 0)}
|
||||
Skipped: {s1.get('skipped', 0)}
|
||||
Invalid: {s1.get('invalid', 0)}
|
||||
Checked: {s1.get("processed", 0)}
|
||||
Migrated: {s1.get("migrated", 0)}
|
||||
Skipped: {s1.get("skipped", 0)}
|
||||
Invalid: {s1.get("invalid", 0)}
|
||||
|
||||
Phase 2 (Process DB):
|
||||
Processed: {s2.get('processed', 0)}
|
||||
Reconciled: {s2.get('reconciled', 0)}
|
||||
Queued: {s2.get('queued', 0)}
|
||||
Processed: {s2.get("processed", 0)}
|
||||
Reconciled: {s2.get("reconciled", 0)}
|
||||
Queued: {s2.get("queued", 0)}
|
||||
""")
|
||||
|
||||
|
||||
def print_index_stats(stats: dict[str, Any]) -> None:
|
||||
from rich import print
|
||||
|
||||
print(f"""
|
||||
[green]Search Reindex Complete[/green]
|
||||
Processed: {stats["processed"]}
|
||||
Reconciled: {stats["reconciled"]}
|
||||
Queued: {stats["queued"]}
|
||||
Reindexed: {stats["reindexed"]}
|
||||
""")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--resume', type=str, help='Resume from timestamp')
|
||||
@click.option('--before', type=float, help='Only snapshots before timestamp')
|
||||
@click.option('--after', type=float, help='Only snapshots after timestamp')
|
||||
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
|
||||
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
|
||||
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@click.option("--resume", type=str, help="Resume from timestamp")
|
||||
@click.option("--before", type=float, help="Only snapshots before timestamp")
|
||||
@click.option("--after", type=float, help="Only snapshots after timestamp")
|
||||
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
|
||||
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
|
||||
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
|
||||
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
|
||||
@click.argument("filter_patterns", nargs=-1)
|
||||
@docstring(update.__doc__)
|
||||
def main(**kwargs):
|
||||
update(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
import sys
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def version(quiet: bool=False,
|
||||
binaries: Iterable[str]=()) -> list[str]:
|
||||
def version(
|
||||
quiet: bool = False,
|
||||
binaries: Iterable[str] = (),
|
||||
) -> list[str]:
|
||||
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
|
||||
|
||||
|
||||
# fast path for just getting the version and exiting, dont do any slower imports
|
||||
from archivebox.config.version import VERSION
|
||||
|
||||
print(VERSION)
|
||||
if quiet or '--version' in sys.argv:
|
||||
if quiet or "--version" in sys.argv:
|
||||
return []
|
||||
|
||||
|
||||
from rich.panel import Panel
|
||||
from rich.console import Console
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS, DATA_DIR
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
|
||||
@@ -34,78 +37,89 @@ def version(quiet: bool=False,
|
||||
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.misc.logging_util import printable_folder_status
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
|
||||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
|
||||
# Check if LDAP is enabled (simple config lookup)
|
||||
config = get_config()
|
||||
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
|
||||
LDAP_ENABLED = config.get("LDAP_ENABLED", False)
|
||||
|
||||
p = platform.uname()
|
||||
COMMIT_HASH = get_COMMIT_HASH()
|
||||
prnt(
|
||||
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
|
||||
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
|
||||
f'BUILD_TIME={get_BUILD_TIME()}',
|
||||
f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]",
|
||||
f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}",
|
||||
f"BUILD_TIME={get_BUILD_TIME()}",
|
||||
)
|
||||
prnt(
|
||||
f'IN_DOCKER={IN_DOCKER}',
|
||||
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
|
||||
f'ARCH={p.machine}',
|
||||
f'OS={p.system}',
|
||||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
|
||||
f"IN_DOCKER={IN_DOCKER}",
|
||||
f"IN_QEMU={SHELL_CONFIG.IN_QEMU}",
|
||||
f"ARCH={p.machine}",
|
||||
f"OS={p.system}",
|
||||
f"PLATFORM={platform.platform()}",
|
||||
f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""),
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
|
||||
except Exception:
|
||||
OUTPUT_IS_REMOTE_FS = False
|
||||
|
||||
|
||||
try:
|
||||
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
|
||||
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}",
|
||||
f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}",
|
||||
f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}",
|
||||
f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}",
|
||||
)
|
||||
except Exception:
|
||||
prnt(
|
||||
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
|
||||
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
|
||||
)
|
||||
|
||||
|
||||
prnt(
|
||||
f'DEBUG={SHELL_CONFIG.DEBUG}',
|
||||
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
|
||||
f'SUDO={CONSTANTS.IS_ROOT}',
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
f"DEBUG={SHELL_CONFIG.DEBUG}",
|
||||
f"IS_TTY={SHELL_CONFIG.IS_TTY}",
|
||||
f"SUDO={CONSTANTS.IS_ROOT}",
|
||||
f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}",
|
||||
f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}",
|
||||
f"LDAP={LDAP_ENABLED}",
|
||||
)
|
||||
prnt()
|
||||
|
||||
|
||||
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
|
||||
PANEL_TEXT = '\n'.join((
|
||||
'',
|
||||
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
|
||||
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
|
||||
'',
|
||||
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
|
||||
'',
|
||||
))
|
||||
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
|
||||
PANEL_TEXT = "\n".join(
|
||||
(
|
||||
"",
|
||||
"[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...",
|
||||
" [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.",
|
||||
"",
|
||||
" [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]",
|
||||
"",
|
||||
),
|
||||
)
|
||||
prnt(
|
||||
Panel(
|
||||
PANEL_TEXT,
|
||||
expand=False,
|
||||
border_style="grey53",
|
||||
title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]",
|
||||
subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
|
||||
),
|
||||
)
|
||||
prnt()
|
||||
return []
|
||||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]")
|
||||
failures = []
|
||||
|
||||
# Setup Django before importing models
|
||||
try:
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
@@ -113,12 +127,17 @@ def version(quiet: bool=False,
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all binaries from the database with timeout protection
|
||||
all_installed = Binary.objects.filter(
|
||||
machine=machine
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||
all_installed = (
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("name")
|
||||
)
|
||||
|
||||
if not all_installed.exists():
|
||||
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
@@ -126,71 +145,91 @@ def version(quiet: bool=False,
|
||||
continue
|
||||
|
||||
if installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = (installed.version or "unknown")[:15]
|
||||
provider = (installed.binprovider or "env")[:8]
|
||||
prnt(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
installed.name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
failures.append(installed.name)
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
|
||||
if not has_any_installed:
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]")
|
||||
|
||||
except Exception as e:
|
||||
# Handle database errors gracefully (locked, missing, etc.)
|
||||
prnt()
|
||||
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
|
||||
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
|
||||
prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]")
|
||||
prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]")
|
||||
|
||||
if not binaries:
|
||||
# Show code and data locations
|
||||
prnt()
|
||||
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
|
||||
prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]")
|
||||
try:
|
||||
for name, path in get_code_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting code locations: {e}[/red]')
|
||||
prnt(f" [red]Error getting code locations: {e}[/red]")
|
||||
|
||||
prnt()
|
||||
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
|
||||
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
|
||||
prnt("[bright_yellow][i] Data locations:[/bright_yellow]")
|
||||
try:
|
||||
for name, path in get_data_locations().items():
|
||||
if isinstance(name, str) and isinstance(path, dict):
|
||||
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
|
||||
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
|
||||
except Exception as e:
|
||||
prnt(f' [red]Error getting data locations: {e}[/red]')
|
||||
|
||||
prnt(f" [red]Error getting data locations: {e}[/red]")
|
||||
|
||||
try:
|
||||
from archivebox.misc.checks import check_data_dir_permissions
|
||||
|
||||
check_data_dir_permissions()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
prnt()
|
||||
prnt('[red][i] Data locations:[/red] (not in a data directory)')
|
||||
|
||||
prnt("[red][i] Data locations:[/red] (not in a data directory)")
|
||||
|
||||
prnt()
|
||||
|
||||
|
||||
if failures:
|
||||
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
|
||||
prnt(f' [red]{", ".join(failures)}[/red]')
|
||||
prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]")
|
||||
prnt(f" [red]{', '.join(failures)}[/red]")
|
||||
prnt()
|
||||
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
|
||||
prnt(' [green]archivebox install[/green]')
|
||||
prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:")
|
||||
prnt(" [green]archivebox install[/green]")
|
||||
prnt()
|
||||
return failures
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
|
||||
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
|
||||
@click.option(
|
||||
"--quiet",
|
||||
"-q",
|
||||
is_flag=True,
|
||||
help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)",
|
||||
)
|
||||
@click.option(
|
||||
"--binaries",
|
||||
"-b",
|
||||
help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)",
|
||||
)
|
||||
@docstring(version.__doc__)
|
||||
def main(**kwargs):
|
||||
failures = version(**kwargs)
|
||||
@@ -198,5 +237,5 @@ def main(**kwargs):
|
||||
raise SystemExit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands,
|
||||
extracted to avoid code duplication.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Optional
|
||||
__package__ = "archivebox.cli"
|
||||
|
||||
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None):
|
||||
"""
|
||||
Apply Django-style filters from CLI kwargs to a QuerySet.
|
||||
|
||||
@@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
||||
"""
|
||||
filters = {}
|
||||
for key, value in filter_kwargs.items():
|
||||
if value is None or key in ('limit', 'offset'):
|
||||
if value is None or key in ("limit", "offset"):
|
||||
continue
|
||||
# Handle CSV lists for __in filters
|
||||
if key.endswith('__in') and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(',')]
|
||||
if key.endswith("__in") and isinstance(value, str):
|
||||
value = [v.strip() for v in value.split(",")]
|
||||
filters[key] = value
|
||||
|
||||
if filters:
|
||||
|
||||
Reference in New Issue
Block a user