This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -1,5 +1,5 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
__package__ = "archivebox.cli"
__command__ = "archivebox"
import os
import sys
from importlib import import_module
@@ -10,55 +10,55 @@ from rich import print
from archivebox.config.version import VERSION
if '--debug' in sys.argv:
os.environ['DEBUG'] = 'True'
sys.argv.remove('--debug')
if "--debug" in sys.argv:
os.environ["DEBUG"] = "True"
sys.argv.remove("--debug")
class ArchiveBoxGroup(click.Group):
"""lazy loading click group for archivebox commands"""
meta_commands = {
'help': 'archivebox.cli.archivebox_help.main',
'version': 'archivebox.cli.archivebox_version.main',
'mcp': 'archivebox.cli.archivebox_mcp.main',
"help": "archivebox.cli.archivebox_help.main",
"version": "archivebox.cli.archivebox_version.main",
"mcp": "archivebox.cli.archivebox_mcp.main",
}
setup_commands = {
'init': 'archivebox.cli.archivebox_init.main',
'install': 'archivebox.cli.archivebox_install.main',
"init": "archivebox.cli.archivebox_init.main",
"install": "archivebox.cli.archivebox_install.main",
}
# Model commands (CRUD operations via subcommands)
model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'archiveresult': 'archivebox.cli.archivebox_archiveresult.main',
'tag': 'archivebox.cli.archivebox_tag.main',
'binary': 'archivebox.cli.archivebox_binary.main',
'process': 'archivebox.cli.archivebox_process.main',
'machine': 'archivebox.cli.archivebox_machine.main',
'persona': 'archivebox.cli.archivebox_persona.main',
"crawl": "archivebox.cli.archivebox_crawl.main",
"snapshot": "archivebox.cli.archivebox_snapshot.main",
"archiveresult": "archivebox.cli.archivebox_archiveresult.main",
"tag": "archivebox.cli.archivebox_tag.main",
"binary": "archivebox.cli.archivebox_binary.main",
"process": "archivebox.cli.archivebox_process.main",
"machine": "archivebox.cli.archivebox_machine.main",
"persona": "archivebox.cli.archivebox_persona.main",
}
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'extract': 'archivebox.cli.archivebox_extract.main',
'list': 'archivebox.cli.archivebox_list.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'status': 'archivebox.cli.archivebox_status.main',
'search': 'archivebox.cli.archivebox_search.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
"add": "archivebox.cli.archivebox_add.main",
"extract": "archivebox.cli.archivebox_extract.main",
"list": "archivebox.cli.archivebox_list.main",
"remove": "archivebox.cli.archivebox_remove.main",
"run": "archivebox.cli.archivebox_run.main",
"update": "archivebox.cli.archivebox_update.main",
"status": "archivebox.cli.archivebox_status.main",
"search": "archivebox.cli.archivebox_search.main",
"config": "archivebox.cli.archivebox_config.main",
"schedule": "archivebox.cli.archivebox_schedule.main",
"server": "archivebox.cli.archivebox_server.main",
"shell": "archivebox.cli.archivebox_shell.main",
"manage": "archivebox.cli.archivebox_manage.main",
# Introspection commands
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
"pluginmap": "archivebox.cli.archivebox_pluginmap.main",
}
legacy_model_commands = {
'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
"crawl": "archivebox.cli.archivebox_crawl_compat.main",
"snapshot": "archivebox.cli.archivebox_snapshot_compat.main",
}
all_subcommands = {
**meta_commands,
@@ -67,15 +67,15 @@ class ArchiveBoxGroup(click.Group):
**archive_commands,
}
renamed_commands = {
'setup': 'install',
'import': 'add',
'archive': 'add',
"setup": "install",
"import": "add",
"archive": "add",
}
legacy_model_subcommands = {
'crawl': {'create', 'list', 'update', 'delete'},
'snapshot': {'create', 'list', 'update', 'delete'},
"crawl": {"create", "list", "update", "delete"},
"snapshot": {"create", "list", "update", "delete"},
}
@classmethod
def get_canonical_name(cls, cmd_name):
return cls.renamed_commands.get(cmd_name, cmd_name)
@@ -90,23 +90,22 @@ class ArchiveBoxGroup(click.Group):
except ValueError:
return False
remaining_args = sys.argv[arg_idx + 1:]
remaining_args = sys.argv[arg_idx + 1 :]
if not remaining_args:
return False
first_arg = remaining_args[0]
if first_arg in ('-h', '--help'):
if first_arg in ("-h", "--help"):
return False
return first_arg not in cls.legacy_model_subcommands[cmd_name]
def get_command(self, ctx, cmd_name):
# handle renamed commands
if cmd_name in self.renamed_commands:
new_name = self.renamed_commands[cmd_name]
print(
f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`',
f" [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`",
file=sys.stderr,
)
cmd_name = new_name
@@ -114,11 +113,11 @@ class ArchiveBoxGroup(click.Group):
if self._should_use_legacy_model_command(cmd_name):
return self._lazy_load(self.legacy_model_commands[cmd_name])
# handle lazy loading of commands
if cmd_name in self.all_subcommands:
return self._lazy_load(cmd_name)
# fall-back to using click's default command lookup
return super().get_command(ctx, cmd_name)
@@ -127,72 +126,74 @@ class ArchiveBoxGroup(click.Group):
import_path = cls.all_subcommands.get(cmd_name_or_path)
if import_path is None:
import_path = cmd_name_or_path
modname, funcname = import_path.rsplit('.', 1)
modname, funcname = import_path.rsplit(".", 1)
# print(f'LAZY LOADING {import_path}')
mod = import_module(modname)
func = getattr(mod, funcname)
if not hasattr(func, '__doc__'):
raise ValueError(f'lazy loading of {import_path} failed - no docstring found on method')
if not hasattr(func, "__doc__"):
raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")
# if not isinstance(cmd, click.BaseCommand):
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
# raise ValueError(f'lazy loading of {import_path} failed - not a click command')
return func
@click.group(cls=ArchiveBoxGroup, invoke_without_command=True)
@click.option('--help', '-h', is_flag=True, help='Show help')
@click.version_option(VERSION, '-v', '--version', package_name='archivebox', message='%(version)s')
@click.option("--help", "-h", is_flag=True, help="Show help")
@click.version_option(VERSION, "-v", "--version", package_name="archivebox", message="%(version)s")
@click.pass_context
def cli(ctx, help=False):
"""ArchiveBox: The self-hosted internet archive"""
subcommand = ArchiveBoxGroup.get_canonical_name(ctx.invoked_subcommand)
# if --help is passed or no subcommand is given, show custom help message
if help or ctx.invoked_subcommand is None:
ctx.invoke(ctx.command.get_command(ctx, 'help'))
ctx.invoke(ctx.command.get_command(ctx, "help"))
# if the subcommand is in archive_commands or model_commands,
# then we need to set up the django environment and check that we're in a valid data folder
if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands:
# print('SETUP DJANGO AND CHECK DATA FOLDER')
try:
if subcommand == 'server':
run_in_debug = '--reload' in sys.argv or os.environ.get('DEBUG') in ('1', 'true', 'True', 'TRUE', 'yes')
if subcommand == "server":
run_in_debug = "--reload" in sys.argv or os.environ.get("DEBUG") in ("1", "true", "True", "TRUE", "yes")
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if '--reload' in sys.argv:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
os.environ["ARCHIVEBOX_RUNSERVER"] = "1"
if "--reload" in sys.argv:
os.environ["ARCHIVEBOX_AUTORELOAD"] = "1"
from archivebox.config.common import STORAGE_CONFIG
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ["ARCHIVEBOX_RUNSERVER_PIDFILE"] = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
from archivebox.config.django import setup_django
from archivebox.misc.checks import check_data_folder
setup_django()
check_data_folder()
except Exception as e:
print(f'[red][X] Error setting up Django or checking data folder: {e}[/red]', file=sys.stderr)
if subcommand not in ('manage', 'shell'): # not all management commands need django to be setup beforehand
print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]", file=sys.stderr)
if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand
raise
def main(args=None, prog_name=None, stdin=None):
# show `docker run archivebox xyz` in help messages if running in docker
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes")
IS_TTY = sys.stdin.isatty()
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox")
# stdin param allows passing input data from caller (used by __main__.py)
# currently not used by click-based CLI, but kept for backwards compatibility
try:
cli(args=args, prog_name=prog_name)
except KeyboardInterrupt:
print('\n\n[red][X] Got CTRL+C. Exiting...[/red]')
print("\n\n[red][X] Got CTRL+C. Exiting...[/red]")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__package__ = "archivebox.cli"
__command__ = "archivebox add"
import sys
from pathlib import Path
@@ -14,6 +14,7 @@ from django.utils import timezone
from django.db.models import QuerySet
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.util import parse_filesize_to_bytes
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
@@ -29,34 +30,38 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
urls: list[str] = []
for record in read_args_or_stdin(args):
url = record.get('url')
url = record.get("url")
if isinstance(url, str) and url:
urls.append(url)
urls_field = record.get('urls')
urls_field = record.get("urls")
if isinstance(urls_field, str):
for line in urls_field.splitlines():
line = line.strip()
if line and not line.startswith('#'):
if line and not line.startswith("#"):
urls.append(line)
return urls
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
url_allowlist: str='',
url_denylist: str='',
parser: str="auto",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool | None=None,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
def add(
urls: str | list[str],
depth: int | str = 0,
max_urls: int = 0,
max_size: int | str = 0,
tag: str = "",
url_allowlist: str = "",
url_denylist: str = "",
parser: str = "auto",
plugins: str = "",
persona: str = "Default",
overwrite: bool = False,
update: bool | None = None,
index_only: bool = False,
bg: bool = False,
created_by_id: int | None = None,
) -> tuple["Crawl", QuerySet["Snapshot"]]:
"""Add a new URL or list of URLs to your archive.
The flow is:
@@ -72,8 +77,15 @@ def add(urls: str | list[str],
from rich import print
depth = int(depth)
max_urls = int(max_urls or 0)
max_size = parse_filesize_to_bytes(max_size)
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
if depth not in (0, 1, 2, 3, 4):
raise ValueError("Depth must be 0-4")
if max_urls < 0:
raise ValueError("max_urls must be >= 0")
if max_size < 0:
raise ValueError("max_size must be >= 0")
# import models once django is set up
from archivebox.core.models import Snapshot
@@ -91,47 +103,49 @@ def add(urls: str | list[str],
update = not ARCHIVING_CONFIG.ONLY_NEW
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
# 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
if cli_args[0].lower().endswith("archivebox"):
cli_args[0] = "archivebox"
cmd_str = " ".join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
# Read URLs directly into crawl
urls_content = sources_file.read_text()
persona_name = (persona or 'Default').strip() or 'Default'
plugins = plugins or str(get_config().get('PLUGINS') or '')
persona_name = (persona or "Default").strip() or "Default"
plugins = plugins or str(get_config().get("PLUGINS") or "")
persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
persona_obj.ensure_dirs()
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
max_urls=max_urls,
max_size=max_size,
tags_str=tag,
persona_id=persona_obj.id,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
created_by_id=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
**({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
**({'URL_DENYLIST': url_denylist} if url_denylist else {}),
}
"ONLY_NEW": not update,
"INDEX_ONLY": index_only,
"OVERWRITE": overwrite,
"PLUGINS": plugins,
"DEFAULT_PERSONA": persona_name,
"PARSER": parser,
**({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
**({"URL_DENYLIST": url_denylist} if url_denylist else {}),
},
)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ""
print(f" [dim]First URL: {first_url}[/dim]")
# 3. The CrawlMachine will create Snapshots from all URLs when started
# Parser extractors run on snapshots and discover more URLs
@@ -139,20 +153,21 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
snapshot, _ = Snapshot.objects.update_or_create(
crawl=crawl, url=url,
crawl=crawl,
url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
"status": Snapshot.INITIAL_STATE,
"retry_at": timezone.now(),
"timestamp": str(timezone.now().timestamp()),
"depth": 0,
},
)
if tag:
snapshot.save_tags(tag.split(','))
snapshot.save_tags(tag.split(","))
snapshot.ensure_crawl_symlink()
return crawl, crawl.snapshot_set.all()
@@ -168,10 +183,12 @@ def add(urls: str | list[str],
if bg:
# Background mode: just queue work and return (background runner via server will pick it up)
print('[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]')
print(
"[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
)
else:
# Foreground mode: run full crawl runner until all work is done
print('[green]\\[*] Starting crawl runner to process crawl...[/green]')
print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
run_crawl(str(crawl.id))
# Print summary for foreground runs
@@ -179,7 +196,10 @@ def add(urls: str | list[str],
crawl.refresh_from_db()
snapshots_count = crawl.snapshot_set.count()
try:
total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all())
from django.db.models import Count, Sum
totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
except Exception:
total_bytes, _, _ = get_dir_size(crawl.output_dir)
total_size = printable_filesize(total_bytes)
@@ -197,23 +217,23 @@ def add(urls: str | list[str],
# Output dir relative to DATA_DIR
try:
rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
rel_output_str = f'./{rel_output}'
rel_output_str = f"./{rel_output}"
except Exception:
rel_output_str = str(crawl.output_dir)
bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
base_url = bind_addr
else:
base_url = f'http://{bind_addr}'
admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/'
base_url = f"http://{bind_addr}"
admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"
print('\n[bold]crawl output saved to:[/bold]')
print(f' {rel_output_str}')
print(f' {admin_url}')
print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}')
print(f'[bold]total size:[/bold] {total_size}')
print(f'[bold]total time:[/bold] {duration_str}')
print("\n[bold]crawl output saved to:[/bold]")
print(f" {rel_output_str}")
print(f" {admin_url}")
print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
print(f"[bold]total size:[/bold] {total_size}")
print(f"[bold]total time:[/bold] {duration_str}")
except Exception:
# Summary is best-effort; avoid failing the command if something goes wrong
pass
@@ -224,29 +244,43 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
@click.option(
"--depth",
"-d",
type=click.Choice([str(i) for i in range(5)]),
default="0",
help="Recursively archive linked pages up to N hops away",
)
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
@click.argument("urls", nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
raw_urls = kwargs.pop('urls')
raw_urls = kwargs.pop("urls")
urls = _collect_input_urls(raw_urls)
if not urls:
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
if int(kwargs.get("max_urls") or 0) < 0:
raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
try:
kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
except ValueError as err:
raise click.BadParameter(str(err), param_hint="--max-size") from err
add(urls=urls, **kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -30,11 +30,10 @@ Examples:
archivebox archiveresult list --status=failed | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox archiveresult'
__package__ = "archivebox.cli"
__command__ = "archivebox archiveresult"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -42,13 +41,13 @@ from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
return {
'type': 'ArchiveResult',
'snapshot_id': str(snapshot_id),
'plugin': plugin,
'hook_name': hook_name,
'status': status,
"type": "ArchiveResult",
"snapshot_id": str(snapshot_id),
"plugin": plugin,
"hook_name": hook_name,
"status": status,
}
@@ -56,10 +55,11 @@ def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str =
# CREATE
# =============================================================================
def create_archiveresults(
snapshot_id: Optional[str] = None,
plugin: Optional[str] = None,
status: str = 'queued',
snapshot_id: str | None = None,
plugin: str | None = None,
status: str = "queued",
) -> int:
"""
Create ArchiveResult request records for Snapshots.
@@ -86,13 +86,13 @@ def create_archiveresults(
snapshots = [Snapshot.objects.get(id=snapshot_id)]
pass_through_records = []
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
return 1
else:
# Read from stdin
records = list(read_stdin())
if not records:
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Separate snapshot records from pass-through records
@@ -100,17 +100,17 @@ def create_archiveresults(
pass_through_records = []
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
if record_type == TYPE_SNAPSHOT:
# Pass through the Snapshot record itself
pass_through_records.append(record)
if record.get('id'):
snapshot_ids.append(record['id'])
if record.get("id"):
snapshot_ids.append(record["id"])
elif record_type == TYPE_ARCHIVERESULT:
# ArchiveResult records: pass through if they have an id
if record.get('id'):
if record.get("id"):
pass_through_records.append(record)
# If no id, we could create it, but for now just pass through
else:
@@ -120,9 +120,9 @@ def create_archiveresults(
# Other typed records (Crawl, Tag, etc): pass through
pass_through_records.append(record)
elif record.get('id'):
elif record.get("id"):
# Untyped record with id - assume it's a snapshot ID
snapshot_ids.append(record['id'])
snapshot_ids.append(record["id"])
# Output pass-through records first
if not is_tty:
@@ -131,15 +131,15 @@ def create_archiveresults(
if not snapshot_ids:
if pass_through_records:
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
return 0
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
return 1
snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
if not snapshots:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
return 0 if pass_through_records else 1
created_count = 0
@@ -150,7 +150,7 @@ def create_archiveresults(
created_count += 1
else:
config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
hooks = discover_hooks('Snapshot', config=config)
hooks = discover_hooks("Snapshot", config=config)
for hook_path in hooks:
hook_name = hook_path.name
plugin_name = hook_path.parent.name
@@ -158,7 +158,7 @@ def create_archiveresults(
write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
created_count += 1
rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
return 0
@@ -166,11 +166,12 @@ def create_archiveresults(
# LIST
# =============================================================================
def list_archiveresults(
status: Optional[str] = None,
plugin: Optional[str] = None,
snapshot_id: Optional[str] = None,
limit: Optional[int] = None,
status: str | None = None,
plugin: str | None = None,
snapshot_id: str | None = None,
limit: int | None = None,
) -> int:
"""
List ArchiveResults as JSONL with optional filters.
@@ -183,13 +184,13 @@ def list_archiveresults(
is_tty = sys.stdout.isatty()
queryset = ArchiveResult.objects.all().order_by('-start_ts')
queryset = ArchiveResult.objects.all().order_by("-start_ts")
# Apply filters
filter_kwargs = {
'status': status,
'plugin': plugin,
'snapshot_id': snapshot_id,
"status": status,
"plugin": plugin,
"snapshot_id": snapshot_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -197,20 +198,22 @@ def list_archiveresults(
for result in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
'noresults': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
"queued": "yellow",
"started": "blue",
"succeeded": "green",
"failed": "red",
"skipped": "dim",
"noresults": "dim",
"backoff": "magenta",
}.get(result.status, "dim")
rprint(
f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
)
else:
write_record(result.to_json())
count += 1
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
return 0
@@ -218,8 +221,9 @@ def list_archiveresults(
# UPDATE
# =============================================================================
def update_archiveresults(
status: Optional[str] = None,
status: str | None = None,
) -> int:
"""
Update ArchiveResults from stdin JSONL.
@@ -238,12 +242,12 @@ def update_archiveresults(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
result_id = record.get('id')
result_id = record.get("id")
if not result_id:
continue
@@ -261,10 +265,10 @@ def update_archiveresults(
write_record(result.to_json())
except ArchiveResult.DoesNotExist:
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
return 0
@@ -272,6 +276,7 @@ def update_archiveresults(
# DELETE
# =============================================================================
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete ArchiveResults from stdin JSONL.
@@ -287,37 +292,37 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
result_ids = [r.get('id') for r in records if r.get('id')]
result_ids = [r.get("id") for r in records if r.get("id")]
if not result_ids:
rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
return 1
results = ArchiveResult.objects.filter(id__in=result_ids)
count = results.count()
if count == 0:
rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
for result in results[:10]:
rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
if count > 10:
rprint(f' ... and {count - 10} more', file=sys.stderr)
rprint(f" ... and {count - 10} more", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = results.delete()
rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
return 0
@@ -325,51 +330,58 @@ def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage ArchiveResult records (plugin extraction results)."""
pass
@main.command('create')
@click.option('--snapshot-id', help='Snapshot ID to create results for')
@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
@main.command("create")
@click.option("--snapshot-id", help="Snapshot ID to create results for")
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
"""Create ArchiveResults for Snapshots from stdin JSONL."""
sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
@click.option('--plugin', '-p', help='Filter by plugin name')
@click.option('--snapshot-id', help='Filter by snapshot ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], plugin: Optional[str],
snapshot_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
@click.option("--plugin", "-p", help="Filter by plugin name")
@click.option("--snapshot-id", help="Filter by snapshot ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
status: str | None,
plugin: str | None,
snapshot_id: str | None,
limit: int | None,
):
"""List ArchiveResults as JSONL."""
sys.exit(list_archiveresults(
status=status,
plugin=plugin,
snapshot_id=snapshot_id,
limit=limit,
))
sys.exit(
list_archiveresults(
status=status,
plugin=plugin,
snapshot_id=snapshot_id,
limit=limit,
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
def update_cmd(status: Optional[str]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
def update_cmd(status: str | None):
"""Update ArchiveResults from stdin JSONL."""
sys.exit(update_archiveresults(status=status))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete ArchiveResults from stdin JSONL."""
sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -25,11 +25,10 @@ Examples:
archivebox binary list --name=chrome | archivebox binary delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox binary'
__package__ = "archivebox.cli"
__command__ = "archivebox binary"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -41,10 +40,11 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_binary(
name: str,
abspath: str,
version: str = '',
version: str = "",
) -> int:
"""
Create/register a Binary.
@@ -59,7 +59,7 @@ def create_binary(
is_tty = sys.stdout.isatty()
if not name or not abspath:
rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr)
rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
return 1
try:
@@ -76,28 +76,30 @@ def create_binary(
# Mirror the Binary model lifecycle used elsewhere in the system so CLI
# records are owned by the current machine and can be safely piped into
# `archivebox run` without creating invalid rows missing machine_id.
binary = Binary.from_json({
'name': name,
'abspath': abspath,
'version': version,
'binproviders': 'env',
'binprovider': 'env',
})
binary = Binary.from_json(
{
"name": name,
"abspath": abspath,
"version": version,
"binproviders": "env",
"binprovider": "env",
},
)
if binary is None:
raise ValueError('failed to create binary record')
raise ValueError("failed to create binary record")
if not is_tty:
write_record(binary.to_json())
if created:
rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr)
rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr)
rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
return 1
@@ -105,11 +107,12 @@ def create_binary(
# LIST
# =============================================================================
def list_binaries(
name: Optional[str] = None,
abspath__icontains: Optional[str] = None,
version__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
abspath__icontains: str | None = None,
version__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Binaries as JSONL with optional filters.
@@ -122,25 +125,25 @@ def list_binaries(
is_tty = sys.stdout.isatty()
queryset = Binary.objects.all().order_by('name', '-modified_at', '-created_at')
queryset = Binary.objects.all().order_by("name", "-modified_at", "-created_at")
# Apply filters
filter_kwargs = {
'name': name,
'abspath__icontains': abspath__icontains,
'version__icontains': version__icontains,
"name": name,
"abspath__icontains": abspath__icontains,
"version__icontains": version__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for binary in queryset:
if is_tty:
rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}')
rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
else:
write_record(binary.to_json())
count += 1
rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} binaries[/dim]", file=sys.stderr)
return 0
@@ -148,9 +151,10 @@ def list_binaries(
# UPDATE
# =============================================================================
def update_binaries(
version: Optional[str] = None,
abspath: Optional[str] = None,
version: str | None = None,
abspath: str | None = None,
) -> int:
"""
Update Binaries from stdin JSONL.
@@ -169,12 +173,12 @@ def update_binaries(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
binary_id = record.get('id')
binary_id = record.get("id")
if not binary_id:
continue
@@ -194,10 +198,10 @@ def update_binaries(
write_record(binary.to_json())
except Binary.DoesNotExist:
rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} binaries[/green]", file=sys.stderr)
return 0
@@ -205,6 +209,7 @@ def update_binaries(
# DELETE
# =============================================================================
def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Binaries from stdin JSONL.
@@ -220,35 +225,35 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
binary_ids = [r.get('id') for r in records if r.get('id')]
binary_ids = [r.get("id") for r in records if r.get("id")]
if not binary_ids:
rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
return 1
binaries = Binary.objects.filter(id__in=binary_ids)
count = binaries.count()
if count == 0:
rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} binaries (dry run)[/yellow]", file=sys.stderr)
for binary in binaries:
rprint(f' {binary.name} {binary.abspath}', file=sys.stderr)
rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = binaries.delete()
rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} binaries[/green]", file=sys.stderr)
return 0
@@ -256,52 +261,59 @@ def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Binary records (detected executables)."""
pass
@main.command('create')
@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)')
@click.option('--abspath', '-p', required=True, help='Absolute path to binary')
@click.option('--version', '-v', default='', help='Binary version')
@main.command("create")
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
@click.option("--version", "-v", default="", help="Binary version")
def create_cmd(name: str, abspath: str, version: str):
"""Create/register a Binary."""
sys.exit(create_binary(name=name, abspath=abspath, version=version))
@main.command('list')
@click.option('--name', '-n', help='Filter by name')
@click.option('--abspath__icontains', help='Filter by path contains')
@click.option('--version__icontains', help='Filter by version contains')
@click.option('--limit', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], abspath__icontains: Optional[str],
version__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", "-n", help="Filter by name")
@click.option("--abspath__icontains", help="Filter by path contains")
@click.option("--version__icontains", help="Filter by version contains")
@click.option("--limit", type=int, help="Limit number of results")
def list_cmd(
name: str | None,
abspath__icontains: str | None,
version__icontains: str | None,
limit: int | None,
):
"""List Binaries as JSONL."""
sys.exit(list_binaries(
name=name,
abspath__icontains=abspath__icontains,
version__icontains=version__icontains,
limit=limit,
))
sys.exit(
list_binaries(
name=name,
abspath__icontains=abspath__icontains,
version__icontains=version__icontains,
limit=limit,
),
)
@main.command('update')
@click.option('--version', '-v', help='Set version')
@click.option('--abspath', '-p', help='Set path')
def update_cmd(version: Optional[str], abspath: Optional[str]):
@main.command("update")
@click.option("--version", "-v", help="Set version")
@click.option("--abspath", "-p", help="Set path")
def update_cmd(version: str | None, abspath: str | None):
"""Update Binaries from stdin JSONL."""
sys.exit(update_binaries(version=version, abspath=abspath))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Binaries from stdin JSONL."""
sys.exit(delete_binaries(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import sys
import rich_click as click
@@ -12,12 +12,14 @@ from archivebox.misc.toml_util import CustomTOMLEncoder
@enforce_types
def config(*keys,
get: bool=False,
set: bool=False,
search: bool=False,
reset: bool=False,
**kwargs) -> None:
def config(
*keys,
get: bool = False,
set: bool = False,
search: bool = False,
reset: bool = False,
**kwargs,
) -> None:
"""Get and set your ArchiveBox project configuration values"""
from archivebox.misc.checks import check_data_folder
@@ -29,8 +31,8 @@ def config(*keys,
FLAT_CONFIG = get_flat_config()
CONFIGS = get_all_configs()
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()])
no_args = not (get or set or reset or config_options)
matching_config = {}
@@ -39,19 +41,19 @@ def config(*keys,
config_options = [get_real_name(key) for key in config_options]
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
for config_section in CONFIGS.values():
aliases = getattr(config_section, 'aliases', {})
aliases = getattr(config_section, "aliases", {})
for search_key in config_options:
# search all aliases in the section
for alias_key, key in aliases.items():
if search_key.lower() in alias_key.lower():
matching_config[key] = dict(config_section)[key]
# search all keys and values in the section
for existing_key, value in dict(config_section).items():
if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower():
matching_config[existing_key] = value
print(printable_config(matching_config))
raise SystemExit(not matching_config)
@@ -61,23 +63,23 @@ def config(*keys,
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
if failed_config:
print('\n[red][X] These options failed to get[/red]')
print(' {}'.format('\n '.join(config_options)))
print("\n[red][X] These options failed to get[/red]")
print(" {}".format("\n ".join(config_options)))
raise SystemExit(1)
else:
matching_config = FLAT_CONFIG
# Display core config sections
for config_section in CONFIGS.values():
section_header = getattr(config_section, 'toml_section_header', '')
section_header = getattr(config_section, "toml_section_header", "")
if isinstance(section_header, str) and section_header:
print(f'[grey53]\\[{section_header}][/grey53]')
print(f"[grey53]\\[{section_header}][/grey53]")
else:
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]")
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
print("[grey53]################################################################[/grey53]")
# Display plugin config section
from archivebox.hooks import discover_plugin_configs
@@ -87,17 +89,17 @@ def config(*keys,
# Collect all plugin config keys
for plugin_name, schema in plugin_configs.items():
if 'properties' not in schema:
if "properties" not in schema:
continue
for key in schema['properties'].keys():
for key in schema["properties"].keys():
if key in matching_config:
plugin_keys[key] = matching_config[key]
# Display all plugin config in single [PLUGINS] section
if plugin_keys:
print('[grey53]\\[PLUGINS][/grey53]')
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
print("[grey53]\\[PLUGINS][/grey53]")
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace("\n\n", "\n"))
print("[grey53]################################################################[/grey53]")
raise SystemExit(not matching_config)
@@ -105,18 +107,20 @@ def config(*keys,
new_config = {}
failed_options = []
for line in config_options:
if line.startswith('#') or not line.strip():
if line.startswith("#") or not line.strip():
continue
if '=' not in line:
print('[red][X] Config KEY=VALUE must have an = sign in it[/red]')
print(f' {line}')
if "=" not in line:
print("[red][X] Config KEY=VALUE must have an = sign in it[/red]")
print(f" {line}")
raise SystemExit(2)
raw_key, val = line.split('=', 1)
raw_key, val = line.split("=", 1)
raw_key = raw_key.upper().strip()
key = get_real_name(raw_key)
if key != raw_key:
print(f'[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]')
print(
f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]",
)
if key in FLAT_CONFIG:
new_config[key] = val.strip()
@@ -136,38 +140,38 @@ def config(*keys,
if side_effect_changes:
print(file=sys.stderr)
print('[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]', file=sys.stderr)
print(' {}'.format(printable_config(side_effect_changes, prefix=' ')), file=sys.stderr)
print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr)
print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr)
if failed_options:
print()
print('[red][X] These options failed to set (check for typos):[/red]')
print(' {}'.format('\n '.join(failed_options)))
print("[red][X] These options failed to set (check for typos):[/red]")
print(" {}".format("\n ".join(failed_options)))
raise SystemExit(1)
elif reset:
print('[red][X] This command is not implemented yet.[/red]')
print(' Please manually remove the relevant lines from your config file:')
print("[red][X] This command is not implemented yet.[/red]")
print(" Please manually remove the relevant lines from your config file:")
raise SystemExit(2)
else:
print('[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]')
print(' archivebox config')
print(' archivebox config --get SOME_KEY')
print(' archivebox config --set SOME_KEY=SOME_VALUE')
print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]")
print(" archivebox config")
print(" archivebox config --get SOME_KEY")
print(" archivebox config --set SOME_KEY=SOME_VALUE")
raise SystemExit(2)
@click.command()
@click.option('--search', is_flag=True, help='Search config KEYs, VALUEs, and ALIASES for the given term')
@click.option('--get', is_flag=True, help='Get the value for the given config KEYs')
@click.option('--set', is_flag=True, help='Set the given KEY=VALUE config values')
@click.option('--reset', is_flag=True, help='Reset the given KEY config values to their defaults')
@click.argument('KEY=VALUE', nargs=-1, type=str)
@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term")
@click.option("--get", is_flag=True, help="Get the value for the given config KEYs")
@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values")
@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults")
@click.argument("KEY=VALUE", nargs=-1, type=str)
@docstring(config.__doc__)
def main(**kwargs) -> None:
config(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -30,11 +30,11 @@ Examples:
archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
__package__ = "archivebox.cli"
__command__ = "archivebox crawl"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
@@ -46,12 +46,13 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_crawl(
urls: Iterable[str],
depth: int = 0,
tag: str = '',
status: str = 'queued',
created_by_id: Optional[int] = None,
tag: str = "",
status: str = "queued",
created_by_id: int | None = None,
) -> int:
"""
Create a Crawl job from URLs.
@@ -74,7 +75,7 @@ def create_crawl(
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Separate pass-through records from URL records
@@ -82,29 +83,29 @@ def create_crawl(
pass_through_records = []
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
# Pass-through: output records that aren't URL/Crawl types
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
pass_through_records.append(record)
continue
# Handle existing Crawl records (just pass through with id)
if record_type == TYPE_CRAWL and record.get('id'):
if record_type == TYPE_CRAWL and record.get("id"):
pass_through_records.append(record)
continue
# Collect URLs
url = record.get('url')
url = record.get("url")
if url:
url_list.append(url)
# Handle 'urls' field (newline-separated)
urls_field = record.get('urls')
urls_field = record.get("urls")
if urls_field:
for line in urls_field.split('\n'):
for line in urls_field.split("\n"):
line = line.strip()
if line and not line.startswith('#'):
if line and not line.startswith("#"):
url_list.append(line)
# Output pass-through records first
@@ -115,44 +116,44 @@ def create_crawl(
if not url_list:
if pass_through_records:
# If we had pass-through records but no URLs, that's OK
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
return 0
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
rprint("[red]No valid URLs found[/red]", file=sys.stderr)
return 1
try:
# Build crawl record with all URLs as newline-separated string
crawl_record = {
'urls': '\n'.join(url_list),
'max_depth': depth,
'tags_str': tag,
'status': status,
'label': '',
"urls": "\n".join(url_list),
"max_depth": depth,
"tags_str": tag,
"status": status,
"label": "",
}
crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
if not crawl:
rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
return 1
# Output JSONL record (only when piped)
if not is_tty:
write_record(crawl.to_json())
rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)
# If TTY, show human-readable output
if is_tty:
rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
for url in url_list[:5]: # Show first 5 URLs
rprint(f' {url[:70]}', file=sys.stderr)
rprint(f" {url[:70]}", file=sys.stderr)
if len(url_list) > 5:
rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
return 0
except Exception as e:
rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating crawl: {e}[/red]", file=sys.stderr)
return 1
@@ -160,11 +161,12 @@ def create_crawl(
# LIST
# =============================================================================
def list_crawls(
status: Optional[str] = None,
urls__icontains: Optional[str] = None,
max_depth: Optional[int] = None,
limit: Optional[int] = None,
status: str | None = None,
urls__icontains: str | None = None,
max_depth: int | None = None,
limit: int | None = None,
) -> int:
"""
List Crawls as JSONL with optional filters.
@@ -177,13 +179,13 @@ def list_crawls(
is_tty = sys.stdout.isatty()
queryset = Crawl.objects.all().order_by('-created_at')
queryset = Crawl.objects.all().order_by("-created_at")
# Apply filters
filter_kwargs = {
'status': status,
'urls__icontains': urls__icontains,
'max_depth': max_depth,
"status": status,
"urls__icontains": urls__icontains,
"max_depth": max_depth,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -191,17 +193,17 @@ def list_crawls(
for crawl in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(crawl.status, 'dim')
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
"queued": "yellow",
"started": "blue",
"sealed": "green",
}.get(crawl.status, "dim")
url_preview = crawl.urls[:50].replace("\n", " ")
rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
else:
write_record(crawl.to_json())
count += 1
rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
return 0
@@ -209,9 +211,10 @@ def list_crawls(
# UPDATE
# =============================================================================
def update_crawls(
status: Optional[str] = None,
max_depth: Optional[int] = None,
status: str | None = None,
max_depth: int | None = None,
) -> int:
"""
Update Crawls from stdin JSONL.
@@ -232,12 +235,12 @@ def update_crawls(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
crawl_id = record.get('id')
crawl_id = record.get("id")
if not crawl_id:
continue
@@ -258,10 +261,10 @@ def update_crawls(
write_record(crawl.to_json())
except Crawl.DoesNotExist:
rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Crawl not found: {crawl_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
return 0
@@ -269,6 +272,7 @@ def update_crawls(
# DELETE
# =============================================================================
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Crawls from stdin JSONL.
@@ -284,36 +288,36 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
crawl_ids = [r.get('id') for r in records if r.get('id')]
crawl_ids = [r.get("id") for r in records if r.get("id")]
if not crawl_ids:
rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
return 1
crawls = Crawl.objects.filter(id__in=crawl_ids)
count = crawls.count()
if count == 0:
rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
for crawl in crawls:
url_preview = crawl.urls[:50].replace('\n', ' ')
rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
url_preview = crawl.urls[:50].replace("\n", " ")
rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = crawls.delete()
rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
return 0
@@ -321,53 +325,60 @@ def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Crawl records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
"""Create a Crawl job from URLs or stdin."""
sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--urls__icontains', help='Filter by URLs contains')
@click.option('--max-depth', type=int, help='Filter by max depth')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], urls__icontains: Optional[str],
max_depth: Optional[int], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--urls__icontains", help="Filter by URLs contains")
@click.option("--max-depth", type=int, help="Filter by max depth")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
status: str | None,
urls__icontains: str | None,
max_depth: int | None,
limit: int | None,
):
"""List Crawls as JSONL."""
sys.exit(list_crawls(
status=status,
urls__icontains=urls__icontains,
max_depth=max_depth,
limit=limit,
))
sys.exit(
list_crawls(
status=status,
urls__icontains=urls__icontains,
max_depth=max_depth,
limit=limit,
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--max-depth', type=int, help='Set max depth')
def update_cmd(status: Optional[str], max_depth: Optional[int]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--max-depth", type=int, help="Set max depth")
def update_cmd(status: str | None, max_depth: int | None):
"""Update Crawls from stdin JSONL."""
sys.exit(update_crawls(status=status, max_depth=max_depth))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Crawls from stdin JSONL."""
sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
__package__ = "archivebox.cli"
__command__ = "archivebox crawl"
import sys
@@ -10,12 +10,12 @@ import rich_click as click
from archivebox.cli.archivebox_add import add
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
@click.argument('urls', nargs=-1)
@click.command(context_settings={"ignore_unknown_options": True})
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--wait/--no-wait", "wait", default=True, help="Accepted for backwards compatibility")
@click.argument("urls", nargs=-1)
def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
"""Backwards-compatible `archivebox crawl URL...` entrypoint."""
del status, wait
@@ -23,5 +23,5 @@ def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
sys.exit(0)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,8 +27,8 @@ Examples:
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
__package__ = "archivebox.cli"
__command__ = "archivebox extract"
import sys
from collections import defaultdict
@@ -52,51 +52,52 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
except ArchiveResult.DoesNotExist:
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
return 1
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
try:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
snapshot.status = snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
crawl = snapshot.crawl
if crawl.status != crawl.StatusChoices.STARTED:
crawl.status = crawl.StatusChoices.QUEUED
crawl.retry_at = timezone.now()
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
return 1
else:
# Still in progress or backoff - not a failure
print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
return 0
except Exception as e:
print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
return 1
def run_plugins(
args: tuple,
records: list[dict] | None = None,
plugins: str = '',
plugins: str = "",
wait: bool = True,
emit_results: bool = True,
) -> int:
"""
Run plugins on Snapshots from input.
@@ -111,16 +112,18 @@ def run_plugins(
from django.utils import timezone
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
read_args_or_stdin,
write_record,
TYPE_SNAPSHOT,
TYPE_ARCHIVERESULT,
)
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot
from archivebox.services.runner import run_crawl
is_tty = sys.stdout.isatty()
# Parse comma-separated plugins list once (reused in creation and filtering)
plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
# Parse stdin/args exactly once per CLI invocation.
# `main()` may already have consumed stdin to distinguish Snapshot input from
@@ -130,41 +133,41 @@ def run_plugins(
records = list(read_args_or_stdin(args))
if not records:
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Gather snapshot IDs and optional plugin constraints to process
snapshot_ids = set()
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
for record in records:
record_type = record.get('type')
record_type = record.get("type")
if record_type == TYPE_SNAPSHOT:
snapshot_id = record.get('id')
snapshot_id = record.get("id")
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
elif record.get("url"):
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
if snap:
snapshot_ids.add(str(snap.id))
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:
snapshot_id = record.get('snapshot_id')
snapshot_id = record.get("snapshot_id")
if snapshot_id:
snapshot_ids.add(snapshot_id)
plugin_name = record.get('plugin')
plugin_name = record.get("plugin")
if plugin_name and not plugins_list:
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
elif 'id' in record:
elif "id" in record:
# Assume it's a snapshot ID
snapshot_ids.add(record['id'])
snapshot_ids.add(record["id"])
if not snapshot_ids:
rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
return 1
# Get snapshots and ensure they have pending ArchiveResults
@@ -173,17 +176,13 @@ def run_plugins(
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
continue
for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
if existing_result and existing_result.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
for plugin_name in requested_plugin_names:
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
if existing_result:
existing_result.reset_for_retry()
# Reset snapshot status to allow processing
@@ -195,34 +194,39 @@ def run_plugins(
processed_count += 1
if processed_count == 0:
rprint('[red]No snapshots to process[/red]', file=sys.stderr)
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
return 1
rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
# Run orchestrator if --wait (default)
if wait:
rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
for snapshot_id in snapshot_ids:
try:
snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
except Snapshot.DoesNotExist:
continue
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
selected_plugins = plugins_list or sorted({
plugin
for snapshot_id in crawl_snapshot_ids
for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
}) or None
selected_plugins = (
plugins_list
or sorted(
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
)
or None
)
run_crawl(
crawl_id,
snapshot_ids=sorted(crawl_snapshot_ids),
selected_plugins=selected_plugins,
)
if not emit_results:
return 0
# Output results as JSONL (when piped) or human-readable (when TTY)
for snapshot_id in snapshot_ids:
try:
@@ -234,11 +238,14 @@ def run_plugins(
for result in results:
if is_tty:
status_color = {
'succeeded': 'green',
'failed': 'red',
'skipped': 'yellow',
}.get(result.status, 'dim')
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ""}', file=sys.stderr)
"succeeded": "green",
"failed": "red",
"skipped": "yellow",
}.get(result.status, "dim")
rprint(
f" [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ''}",
file=sys.stderr,
)
else:
write_record(result.to_json())
except Snapshot.DoesNotExist:
@@ -250,18 +257,20 @@ def run_plugins(
def is_archiveresult_id(value: str) -> bool:
"""Check if value looks like an ArchiveResult UUID."""
import re
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
if not uuid_pattern.match(value):
return False
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
from archivebox.core.models import ArchiveResult
return ArchiveResult.objects.filter(id=value).exists()
@click.command()
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
@click.argument("args", nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
from archivebox.misc.jsonl import read_args_or_stdin
@@ -271,14 +280,12 @@ def main(plugins: str, wait: bool, args: tuple):
if not records:
from rich import print as rprint
rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
sys.exit(1)
# Check if input looks like existing ArchiveResult IDs to process
all_are_archiveresult_ids = all(
is_archiveresult_id(r.get('id') or r.get('url', ''))
for r in records
)
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
if all_are_archiveresult_ids:
# Process existing ArchiveResults by ID
@@ -286,9 +293,9 @@ def main(plugins: str, wait: bool, args: tuple):
exit_code = 0
for record in records:
archiveresult_id = record.get('id') or record.get('url')
archiveresult_id = record.get("id") or record.get("url")
if not isinstance(archiveresult_id, str):
rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
exit_code = 1
continue
result = process_archiveresult_by_id(archiveresult_id)
@@ -300,5 +307,5 @@ def main(plugins: str, wait: bool, args: tuple):
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
__package__ = "archivebox.cli"
__command__ = "archivebox help"
import os
import os
from pathlib import Path
import click
@@ -17,33 +17,44 @@ def help() -> None:
from archivebox.config import CONSTANTS
from archivebox.config.permissions import IN_DOCKER
from archivebox.misc.logging_util import log_cli_command
log_cli_command('help', [], None, '.')
COMMANDS_HELP_TEXT = '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.meta_commands.keys()
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.setup_commands.keys()
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}'
for cmd in ArchiveBoxGroup.archive_commands.keys()
log_cli_command("help", [], None, ".")
COMMANDS_HELP_TEXT = (
"\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.meta_commands.keys()
)
+ "\n\n "
+ "\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.setup_commands.keys()
)
+ "\n\n "
+ "\n ".join(
f"[green]{cmd.ljust(20)}[/green] {ArchiveBoxGroup._lazy_load(cmd).__doc__}" for cmd in ArchiveBoxGroup.archive_commands.keys()
)
)
DOCKER_USAGE = '''
DOCKER_USAGE = (
"""
[dodger_blue3]Docker Usage:[/dodger_blue3]
[grey53]# using Docker Compose:[/grey53]
[blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
[grey53]# using Docker:[/grey53]
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
''' if IN_DOCKER else ''
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
"""
if IN_DOCKER
else ""
)
DOCKER_DOCS = (
"\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]"
if IN_DOCKER
else ""
)
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ""
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ""
print(f'''{DOCKER_USAGE}
print(f"""{DOCKER_USAGE}
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
[dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
@@ -54,12 +65,11 @@ def help() -> None:
[link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS}
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link]
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link]
''')
""")
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir():
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path('~').expanduser()), '~')
EXAMPLE_USAGE = f'''
pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~")
EXAMPLE_USAGE = f"""
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
[violet]Hint:[/violet] [i]Common maintenance tasks:[/i]
@@ -73,33 +83,49 @@ def help() -> None:
[dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title
[dark_green]archivebox[/dark_green] [green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss"
[dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53]
'''
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
"""
print(
Panel(
EXAMPLE_USAGE,
expand=False,
border_style="grey53",
title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]",
subtitle="Commands run inside this dir will only apply to this collection.",
),
)
else:
DATA_SETUP_HELP = '\n'
DATA_SETUP_HELP = "\n"
if IN_DOCKER:
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
DATA_SETUP_HELP += ' 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
DATA_SETUP_HELP += f' 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n'
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n'
DATA_SETUP_HELP += 'To start a [sea_green1]new[/sea_green1] collection:\n'
DATA_SETUP_HELP += ' 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n'
DATA_SETUP_HELP += ' 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n'
DATA_SETUP_HELP += f' 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n'
DATA_SETUP_HELP += f' 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n'
DATA_SETUP_HELP += f' 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n'
print(Panel(DATA_SETUP_HELP, expand=False, border_style='grey53', title='[red]:cross_mark: No collection is currently active[/red]', subtitle='All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n"
DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n"
DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n"
DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n"
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n"
DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n"
DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n"
DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n"
DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# initialize ArchiveBox in the new data dir[/grey53]\n"
DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-install all plugins (wget, chrome, singlefile, etc.)[/grey53]\n"
DATA_SETUP_HELP += f" 5. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ... get help with next steps... [/grey53]\n"
print(
Panel(
DATA_SETUP_HELP,
expand=False,
border_style="grey53",
title="[red]:cross_mark: No collection is currently active[/red]",
subtitle="All archivebox [green]commands[/green] should be run from inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
),
)
@click.command()
@click.option('--help', '-h', is_flag=True, help='Show help')
@click.option("--help", "-h", is_flag=True, help="Show help")
def main(**kwargs):
"""Print the ArchiveBox help message and usage"""
return help()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
import sys
from pathlib import Path
from typing import Mapping
from collections.abc import Mapping
from rich import print
import rich_click as click
@@ -14,12 +14,12 @@ from archivebox.misc.util import docstring, enforce_types
def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, dict[str, object]] | None:
url = link_dict.get('url')
url = link_dict.get("url")
if not isinstance(url, str) or not url:
return None
record: dict[str, object] = {'url': url}
for key in ('timestamp', 'title', 'tags', 'sources'):
record: dict[str, object] = {"url": url}
for key in ("timestamp", "title", "tags", "sources"):
value = link_dict.get(key)
if value is not None:
record[key] = value
@@ -27,15 +27,15 @@ def _normalize_snapshot_record(link_dict: Mapping[str, object]) -> tuple[str, di
@enforce_types
def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
def init(force: bool = False, quick: bool = False, install: bool = False) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details
from archivebox.misc.db import apply_migrations
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
# print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
@@ -43,69 +43,71 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
if is_empty and not existing_index:
print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
print('[green]----------------------------------------------------------------------[/green]')
print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]")
print("[green]----------------------------------------------------------------------[/green]")
elif existing_index:
# TODO: properly detect and print the existing version in current index as well
print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
print('[green]----------------------------------------------------------------------[/green]')
print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]")
print("[green]----------------------------------------------------------------------[/green]")
else:
if force:
print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
print("[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]")
print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]")
else:
print(
("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
"[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
" You must run init in a completely empty directory, or an existing data folder.\n\n"
" [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
" then run and run 'archivebox init' to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
)
" (Always make sure your data folder is backed up first before updating ArchiveBox)",
)
raise SystemExit(2)
if existing_index:
print('\n[green][*] Verifying archive folder structure...[/green]')
print("\n[green][*] Verifying archive folder structure...[/green]")
else:
print('\n[green][+] Building archive folder structure...[/green]')
print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
print("\n[green][+] Building archive folder structure...[/green]")
print(
f" + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...",
)
Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
print(f" + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...")
# create the .archivebox_id file with a unique ID for this collection
from archivebox.config.paths import _get_collection_id
_get_collection_id(DATA_DIR, force_create=True)
# create the ArchiveBox.conf file
write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
_get_collection_id(DATA_DIR, force_create=True)
# create the ArchiveBox.conf file
write_config_file({"SECRET_KEY": SERVER_CONFIG.SECRET_KEY})
if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]")
else:
print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
print("\n[green][+] Building main SQL index and running initial migrations...[/green]")
from archivebox.config.django import setup_django
setup_django()
for migration_line in apply_migrations(DATA_DIR):
sys.stdout.write(f' {migration_line}\n')
sys.stdout.write(f" {migration_line}\n")
assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
print()
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
print(f" √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}")
# from django.contrib.auth.models import User
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
# call_command("createsuperuser", interactive=True)
print()
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]")
from archivebox.core.models import Snapshot
@@ -114,10 +116,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
if existing_index:
all_links = Snapshot.objects.all()
print(f' √ Loaded {all_links.count()} links from existing main index.')
print(f" √ Loaded {all_links.count()} links from existing main index.")
if quick:
print(' > Skipping orphan snapshot import (quick mode)')
print(" > Skipping orphan snapshot import (quick mode)")
else:
try:
# Import orphaned links from legacy JSON indexes
@@ -131,7 +133,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
orphaned_json_links[url] = record
if orphaned_json_links:
pending_links.update(orphaned_json_links)
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
print(f" [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]")
orphaned_data_dir_links: dict[str, dict[str, object]] = {}
for link_dict in parse_json_links_details(DATA_DIR):
@@ -143,7 +145,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
orphaned_data_dir_links[url] = record
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
print(f" [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]")
if pending_links:
for link_dict in pending_links.values():
@@ -151,42 +153,44 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
# Hint for orphaned snapshot directories
print()
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
print(' archivebox update')
print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:")
print(" archivebox update")
except (KeyboardInterrupt, SystemExit):
print(file=sys.stderr)
print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
print("[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]", file=sys.stderr)
print(" Your archive data is safe, but you should re-run `archivebox init` to finish the process later.", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
print(' archivebox init --quick', file=sys.stderr)
print(" [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:", file=sys.stderr)
print(" archivebox init --quick", file=sys.stderr)
raise SystemExit(1)
print('\n[green]----------------------------------------------------------------------[/green]')
print("\n[green]----------------------------------------------------------------------[/green]")
from django.contrib.auth.models import User
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(
username=SERVER_CONFIG.ADMIN_USERNAME,
).exists():
print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]")
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
if existing_index:
print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
print("[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]")
else:
print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
print(f"[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]")
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
(CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
(CONSTANTS.DEFAULT_LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
from archivebox.config.common import STORAGE_CONFIG
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
(STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
(STORAGE_CONFIG.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True)
working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True)
if working_tmp_dir:
@@ -195,33 +199,35 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True)
if working_lib_dir:
working_lib_dir.mkdir(parents=True, exist_ok=True)
(working_lib_dir / 'bin').mkdir(parents=True, exist_ok=True)
(working_lib_dir / "bin").mkdir(parents=True, exist_ok=True)
if install:
from archivebox.cli.archivebox_install import install as install_method
install_method()
if Snapshot.objects.count() < 25: # hide the hints for experienced users
if Snapshot.objects.count() < 25: # hide the hints for experienced users
print()
print(' [violet]Hint:[/violet] To view your archive index, run:')
print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
print(" [violet]Hint:[/violet] To view your archive index, run:")
print(
" archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]",
)
print()
print(' To add new links, you can run:')
print(" To add new links, you can run:")
print(" archivebox add < ~/some/path/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
print(" For more usage and examples, run:")
print(" archivebox help")
@click.command()
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway")
@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs")
@click.option("--install", "-s", is_flag=True, help="Automatically install dependencies and extras used for archiving")
@docstring(init.__doc__)
def main(**kwargs) -> None:
init(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
@@ -11,7 +11,7 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None:
"""Detect and install ArchiveBox dependencies by running the abx-dl install flow
Examples:
@@ -31,33 +31,34 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
# Show what we're installing
if binaries:
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]")
else:
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]")
if binproviders != '*':
print(f'[green][+] Using providers: {binproviders}[/green]')
if binproviders != "*":
print(f"[green][+] Using providers: {binproviders}[/green]")
if IS_ROOT:
EUID = os.geteuid()
print()
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print(f"[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]")
print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].")
print()
if dry_run:
print('[dim]Dry run - would run the abx-dl install flow[/dim]')
print("[dim]Dry run - would run the abx-dl install flow[/dim]")
return
# Set up Django
from archivebox.config.django import setup_django
setup_django()
plugin_names = list(binaries)
if binproviders != '*':
plugin_names.extend(provider.strip() for provider in binproviders.split(',') if provider.strip())
if binproviders != "*":
plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip())
print('[+] Running installer via abx-dl bus...')
print("[+] Running installer via abx-dl bus...")
print()
from archivebox.services.runner import run_install
@@ -68,28 +69,36 @@ def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bo
# Check for superuser
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green")
stderr(" archivebox manage createsuperuser")
print()
# Show version to display full status including installed binaries
# Django is already loaded, so just import and call the function directly
from archivebox.cli.archivebox_version import version as show_version
show_version(quiet=False)
@click.command()
@click.argument('binaries', nargs=-1, type=str, required=False)
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@click.argument("binaries", nargs=-1, type=str, required=False)
@click.option(
"--binproviders",
"-p",
default="*",
help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all",
show_default=True,
)
@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:
install(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,10 +1,9 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__package__ = "archivebox.cli"
__command__ = "archivebox list"
import sys
from typing import Optional
import rich_click as click
@@ -12,31 +11,47 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.command()
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def main(
status: str | None,
url__icontains: str | None,
url__istartswith: str | None,
tag: str | None,
crawl_id: str | None,
limit: int | None,
sort: str | None,
csv: str | None,
with_headers: bool,
search: str | None,
query: tuple[str, ...],
) -> None:
"""List Snapshots."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
))
sys.exit(
list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
search=search,
query=" ".join(query),
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -19,11 +19,10 @@ Examples:
archivebox machine list --hostname__icontains=myserver
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox machine'
__package__ = "archivebox.cli"
__command__ = "archivebox machine"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -35,10 +34,11 @@ from archivebox.cli.cli_utils import apply_filters
# LIST
# =============================================================================
def list_machines(
hostname__icontains: Optional[str] = None,
os_platform: Optional[str] = None,
limit: Optional[int] = None,
hostname__icontains: str | None = None,
os_platform: str | None = None,
limit: int | None = None,
) -> int:
"""
List Machines as JSONL with optional filters.
@@ -51,24 +51,24 @@ def list_machines(
is_tty = sys.stdout.isatty()
queryset = Machine.objects.all().order_by('-created_at')
queryset = Machine.objects.all().order_by("-created_at")
# Apply filters
filter_kwargs = {
'hostname__icontains': hostname__icontains,
'os_platform': os_platform,
"hostname__icontains": hostname__icontains,
"os_platform": os_platform,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for machine in queryset:
if is_tty:
rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}')
rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}")
else:
write_record(machine.to_json())
count += 1
rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr)
return 0
@@ -76,24 +76,27 @@ def list_machines(
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Machine records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--hostname__icontains', help='Filter by hostname contains')
@click.option('--os-platform', help='Filter by OS platform')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--hostname__icontains", help="Filter by hostname contains")
@click.option("--os-platform", help="Filter by OS platform")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None):
"""List Machines as JSONL."""
sys.exit(list_machines(
hostname__icontains=hostname__icontains,
os_platform=os_platform,
limit=limit,
))
sys.exit(
list_machines(
hostname__icontains=hostname__icontains,
os_platform=os_platform,
limit=limit,
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,33 +1,34 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import rich_click as click
from archivebox.misc.util import docstring, enforce_types
@enforce_types
def manage(args: list[str] | None=None) -> None:
def manage(args: list[str] | None = None) -> None:
"""Run an ArchiveBox Django management command"""
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.logging import stderr
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow")
stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow")
stderr("")
from django.core.management import execute_from_command_line
execute_from_command_line(['manage.py', *(args or ['help'])])
execute_from_command_line(["manage.py", *(args or ["help"])])
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@click.argument("args", nargs=-1)
@docstring(manage.__doc__)
def main(args: list[str] | None=None) -> None:
def main(args: list[str] | None = None) -> None:
manage(args=args)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -6,8 +6,8 @@ Start the Model Context Protocol (MCP) server in stdio mode.
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox mcp'
__package__ = "archivebox.cli"
__command__ = "archivebox mcp"
import rich_click as click
@@ -45,5 +45,5 @@ def main(**kwargs):
mcp()
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -24,8 +24,8 @@ Examples:
archivebox persona list --name=old | archivebox persona delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox persona'
__package__ = "archivebox.cli"
__command__ = "archivebox persona"
import os
import sys
@@ -35,7 +35,7 @@ import subprocess
import tempfile
import json
from pathlib import Path
from typing import Optional, Iterable
from collections.abc import Iterable
from collections import OrderedDict
import rich_click as click
@@ -49,134 +49,145 @@ from archivebox.personas import importers as persona_importers
# Browser Profile Locations
# =============================================================================
def get_chrome_user_data_dir() -> Optional[Path]:
def get_chrome_user_data_dir() -> Path | None:
"""Get the default Chrome user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin': # macOS
if system == "Darwin": # macOS
candidates = [
home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
home / 'Library' / 'Application Support' / 'Chromium',
home / "Library" / "Application Support" / "Google" / "Chrome",
home / "Library" / "Application Support" / "Chromium",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'google-chrome',
home / '.config' / 'chromium',
home / '.config' / 'chrome',
home / 'snap' / 'chromium' / 'common' / 'chromium',
home / ".config" / "google-chrome",
home / ".config" / "chromium",
home / ".config" / "chrome",
home / "snap" / "chromium" / "common" / "chromium",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'Google' / 'Chrome' / 'User Data',
local_app_data / 'Chromium' / 'User Data',
local_app_data / "Google" / "Chrome" / "User Data",
local_app_data / "Chromium" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_brave_user_data_dir() -> Optional[Path]:
def get_brave_user_data_dir() -> Path | None:
"""Get the default Brave user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin':
if system == "Darwin":
candidates = [
home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'BraveSoftware' / 'Brave-Browser',
home / ".config" / "BraveSoftware" / "Brave-Browser",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_edge_user_data_dir() -> Optional[Path]:
def get_edge_user_data_dir() -> Path | None:
"""Get the default Edge user data directory for the current platform."""
system = platform.system()
home = Path.home()
if system == 'Darwin':
if system == "Darwin":
candidates = [
home / 'Library' / 'Application Support' / 'Microsoft Edge',
home / "Library" / "Application Support" / "Microsoft Edge",
]
elif system == 'Linux':
elif system == "Linux":
candidates = [
home / '.config' / 'microsoft-edge',
home / '.config' / 'microsoft-edge-beta',
home / '.config' / 'microsoft-edge-dev',
home / ".config" / "microsoft-edge",
home / ".config" / "microsoft-edge-beta",
home / ".config" / "microsoft-edge-dev",
]
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = [
local_app_data / 'Microsoft' / 'Edge' / 'User Data',
local_app_data / "Microsoft" / "Edge" / "User Data",
]
else:
candidates = []
for candidate in candidates:
if candidate.exists() and (candidate / 'Default').exists():
if candidate.exists() and (candidate / "Default").exists():
return candidate
return None
def get_browser_binary(browser: str) -> Optional[str]:
def get_browser_binary(browser: str) -> str | None:
system = platform.system()
home = Path.home()
browser = browser.lower()
if system == 'Darwin':
if system == "Darwin":
candidates = {
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
"chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
"chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
"brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
"edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
}.get(browser, [])
elif system == 'Linux':
elif system == "Linux":
candidates = {
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
"chrome": [
"/usr/bin/google-chrome",
"/usr/bin/google-chrome-stable",
"/usr/bin/google-chrome-beta",
"/usr/bin/google-chrome-unstable",
],
"chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
"brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
"edge": [
"/usr/bin/microsoft-edge",
"/usr/bin/microsoft-edge-stable",
"/usr/bin/microsoft-edge-beta",
"/usr/bin/microsoft-edge-dev",
],
}.get(browser, [])
elif system == 'Windows':
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
elif system == "Windows":
local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
candidates = {
'chrome': [
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
"chrome": [
str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
],
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
'brave': [
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
"chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
"brave": [
str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
"C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
"C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
],
'edge': [
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
"edge": [
str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
"C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
],
}.get(browser, [])
else:
@@ -190,13 +201,13 @@ def get_browser_binary(browser: str) -> Optional[str]:
BROWSER_PROFILE_FINDERS = {
'chrome': get_chrome_user_data_dir,
'chromium': get_chrome_user_data_dir, # Same locations
'brave': get_brave_user_data_dir,
'edge': get_edge_user_data_dir,
"chrome": get_chrome_user_data_dir,
"chromium": get_chrome_user_data_dir, # Same locations
"brave": get_brave_user_data_dir,
"edge": get_edge_user_data_dir,
}
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"}
# =============================================================================
@@ -204,12 +215,12 @@ CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
# =============================================================================
NETSCAPE_COOKIE_HEADER = [
'# Netscape HTTP Cookie File',
'# https://curl.se/docs/http-cookies.html',
'# This file was generated by ArchiveBox persona cookie extraction',
'#',
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
'',
"# Netscape HTTP Cookie File",
"# https://curl.se/docs/http-cookies.html",
"# This file was generated by ArchiveBox persona cookie extraction",
"#",
"# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
"",
]
@@ -219,9 +230,9 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
return cookies
for line in path.read_text().splitlines():
if not line or line.startswith('#'):
if not line or line.startswith("#"):
continue
parts = line.split('\t')
parts = line.split("\t")
if len(parts) < 7:
continue
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
@@ -233,8 +244,8 @@ def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tu
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
lines = list(NETSCAPE_COOKIE_HEADER)
for cookie in cookies.values():
lines.append('\t'.join(cookie))
path.write_text('\n'.join(lines) + '\n')
lines.append("\t".join(cookie))
path.write_text("\n".join(lines) + "\n")
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
@@ -259,52 +270,52 @@ def extract_cookies_via_cdp(
from archivebox.config.common import STORAGE_CONFIG
# Find the cookie extraction script
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
extract_script = chrome_plugin_dir / 'extract_cookies.js'
chrome_plugin_dir = Path(__file__).parent.parent / "plugins" / "chrome"
extract_script = chrome_plugin_dir / "extract_cookies.js"
if not extract_script.exists():
rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction script not found at {extract_script}[/yellow]", file=sys.stderr)
return False
# Get node modules dir
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
# Set up environment
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
env['CHROME_HEADLESS'] = 'true'
env["NODE_MODULES_DIR"] = str(node_modules_dir)
env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
env["CHROME_HEADLESS"] = "true"
if chrome_binary:
env['CHROME_BINARY'] = str(chrome_binary)
env["CHROME_BINARY"] = str(chrome_binary)
output_path = output_file
temp_output = None
temp_dir = None
if output_file.exists():
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
temp_output = temp_dir / 'cookies.txt'
temp_dir = Path(tempfile.mkdtemp(prefix="ab_cookies_"))
temp_output = temp_dir / "cookies.txt"
output_path = temp_output
if profile_dir:
extra_arg = f'--profile-directory={profile_dir}'
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
extra_arg = f"--profile-directory={profile_dir}"
existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
args_list = []
if existing_extra:
if existing_extra.startswith('['):
if existing_extra.startswith("["):
try:
parsed = json.loads(existing_extra)
if isinstance(parsed, list):
args_list.extend(str(x) for x in parsed)
except Exception:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
else:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
args_list.append(extra_arg)
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
env['COOKIES_OUTPUT_FILE'] = str(output_path)
env["COOKIES_OUTPUT_FILE"] = str(output_path)
try:
result = subprocess.run(
['node', str(extract_script)],
["node", str(extract_script)],
env=env,
capture_output=True,
text=True,
@@ -316,17 +327,17 @@ def extract_cookies_via_cdp(
_merge_netscape_cookies(output_file, temp_output)
return True
else:
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction failed: {result.stderr}[/yellow]", file=sys.stderr)
return False
except subprocess.TimeoutExpired:
rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
rprint("[yellow]Cookie extraction timed out[/yellow]", file=sys.stderr)
return False
except FileNotFoundError:
rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
rprint("[yellow]Node.js not found. Cannot extract cookies.[/yellow]", file=sys.stderr)
return False
except Exception as e:
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Cookie extraction error: {e}[/yellow]", file=sys.stderr)
return False
finally:
if temp_dir and temp_dir.exists():
@@ -337,6 +348,7 @@ def extract_cookies_via_cdp(
# Validation Helpers
# =============================================================================
def validate_persona_name(name: str) -> tuple[bool, str]:
"""
Validate persona name to prevent path traversal attacks.
@@ -348,19 +360,19 @@ def validate_persona_name(name: str) -> tuple[bool, str]:
return False, "Persona name cannot be empty"
# Check for path separators
if '/' in name or '\\' in name:
if "/" in name or "\\" in name:
return False, "Persona name cannot contain path separators (/ or \\)"
# Check for parent directory references
if '..' in name:
if ".." in name:
return False, "Persona name cannot contain parent directory references (..)"
# Check for hidden files/directories
if name.startswith('.'):
if name.startswith("."):
return False, "Persona name cannot start with a dot (.)"
# Ensure name doesn't contain null bytes or other dangerous chars
if '\x00' in name or '\n' in name or '\r' in name:
if "\x00" in name or "\n" in name or "\r" in name:
return False, "Persona name contains invalid characters"
return True, ""
@@ -394,10 +406,11 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
# CREATE
# =============================================================================
def create_personas(
names: Iterable[str],
import_from: Optional[str] = None,
profile: Optional[str] = None,
import_from: str | None = None,
profile: str | None = None,
) -> int:
"""
Create Personas from names.
@@ -416,7 +429,7 @@ def create_personas(
name_list = list(names) if names else []
if not name_list:
rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
rprint("[yellow]No persona names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
return 1
# Validate import source if specified
@@ -424,23 +437,23 @@ def create_personas(
if import_from:
import_from = import_from.lower()
if import_from not in BROWSER_PROFILE_FINDERS:
rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr)
rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr)
return 1
source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
if not source_profile_dir:
rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr)
return 1
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr)
if profile is None and (source_profile_dir / 'Default').exists():
profile = 'Default'
if profile is None and (source_profile_dir / "Default").exists():
profile = "Default"
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr)
created_count = 0
for name in name_list:
@@ -459,11 +472,11 @@ def create_personas(
if created:
persona.ensure_dirs()
created_count += 1
rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)
rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr)
cookies_file = Path(persona.path) / 'cookies.txt'
cookies_file = Path(persona.path) / "cookies.txt"
# Import browser profile if requested
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
@@ -477,29 +490,31 @@ def create_personas(
capture_storage=False,
)
except Exception as e:
rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr)
return 1
if import_result.profile_copied:
rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr)
if import_result.cookies_imported:
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr)
elif not import_result.profile_copied:
rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr)
for warning in import_result.warnings:
rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr)
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
'COOKIES_FILE': persona.COOKIES_FILE,
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
"COOKIES_FILE": persona.COOKIES_FILE,
},
)
rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr)
return 0
@@ -507,10 +522,11 @@ def create_personas(
# LIST
# =============================================================================
def list_personas(
name: Optional[str] = None,
name__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
name__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Personas as JSONL with optional filters.
@@ -523,33 +539,35 @@ def list_personas(
is_tty = sys.stdout.isatty()
queryset = Persona.objects.all().order_by('name')
queryset = Persona.objects.all().order_by("name")
# Apply filters
filter_kwargs = {
'name': name,
'name__icontains': name__icontains,
"name": name,
"name__icontains": name__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for persona in queryset:
cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
cookies_status = "[green]✓[/green]" if persona.COOKIES_FILE else "[dim]✗[/dim]"
chrome_status = "[green]✓[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]✗[/dim]"
if is_tty:
rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]")
else:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
'COOKIES_FILE': persona.COOKIES_FILE,
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
"CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR,
"COOKIES_FILE": persona.COOKIES_FILE,
},
)
count += 1
rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr)
return 0
@@ -557,7 +575,8 @@ def list_personas(
# UPDATE
# =============================================================================
def update_personas(name: Optional[str] = None) -> int:
def update_personas(name: str | None = None) -> int:
"""
Update Personas from stdin JSONL.
@@ -575,13 +594,13 @@ def update_personas(name: Optional[str] = None) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
persona_id = record.get('id')
old_name = record.get('name')
persona_id = record.get("id")
old_name = record.get("name")
if not persona_id and not old_name:
continue
@@ -613,17 +632,19 @@ def update_personas(name: Optional[str] = None) -> int:
updated_count += 1
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
'name': persona.name,
'path': str(persona.path),
})
write_record(
{
"id": str(persona.id) if hasattr(persona, "id") else None,
"name": persona.name,
"path": str(persona.path),
},
)
except Persona.DoesNotExist:
rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr)
return 0
@@ -631,6 +652,7 @@ def update_personas(name: Optional[str] = None) -> int:
# DELETE
# =============================================================================
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Personas from stdin JSONL.
@@ -646,23 +668,24 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Collect persona IDs or names
persona_ids = []
persona_names = []
for r in records:
if r.get('id'):
persona_ids.append(r['id'])
elif r.get('name'):
persona_names.append(r['name'])
if r.get("id"):
persona_ids.append(r["id"])
elif r.get("name"):
persona_names.append(r["name"])
if not persona_ids and not persona_names:
rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr)
return 1
from django.db.models import Q
query = Q()
if persona_ids:
query |= Q(id__in=persona_ids)
@@ -673,17 +696,17 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
count = personas.count()
if count == 0:
rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr)
for persona in personas:
rprint(f' {persona.name} ({persona.path})', file=sys.stderr)
rprint(f" {persona.name} ({persona.path})", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Delete persona directories and database records
@@ -701,7 +724,7 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
persona.delete()
deleted_count += 1
rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr)
return 0
@@ -709,44 +732,45 @@ def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Persona records (browser profiles)."""
pass
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
@main.command("create")
@click.argument("names", nargs=-1)
@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)")
@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)")
def create_cmd(names: tuple, import_from: str | None, profile: str | None):
"""Create Personas, optionally importing from a browser profile."""
sys.exit(create_personas(names, import_from=import_from, profile=profile))
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
"""List Personas as JSONL."""
sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit))
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None):
"""Update Personas from stdin JSONL."""
sys.exit(update_personas(name=name))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Personas from stdin JSONL."""
sys.exit(delete_personas(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Optional
import rich_click as click
@@ -137,7 +136,7 @@ BINARY_MACHINE_DIAGRAM = """
@enforce_types
def pluginmap(
show_disabled: bool = False,
model: Optional[str] = None,
model: str | None = None,
quiet: bool = False,
) -> dict:
"""
@@ -164,25 +163,25 @@ def pluginmap(
# Model event types that can have hooks
model_events = {
'Crawl': {
'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
'machine': 'CrawlMachine',
'diagram': CRAWL_MACHINE_DIAGRAM,
"Crawl": {
"description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
"machine": "CrawlMachine",
"diagram": CRAWL_MACHINE_DIAGRAM,
},
'CrawlEnd': {
'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
'machine': 'CrawlMachine',
'diagram': None, # Part of CrawlMachine
"CrawlEnd": {
"description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
"machine": "CrawlMachine",
"diagram": None, # Part of CrawlMachine
},
'Snapshot': {
'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
'machine': 'SnapshotMachine',
'diagram': SNAPSHOT_MACHINE_DIAGRAM,
"Snapshot": {
"description": "Hooks run for each Snapshot (creates ArchiveResults)",
"machine": "SnapshotMachine",
"diagram": SNAPSHOT_MACHINE_DIAGRAM,
},
'Binary': {
'description': 'Hooks for installing binary dependencies (providers)',
'machine': 'BinaryMachine',
'diagram': BINARY_MACHINE_DIAGRAM,
"Binary": {
"description": "Hooks for installing binary dependencies (providers)",
"machine": "BinaryMachine",
"diagram": BINARY_MACHINE_DIAGRAM,
},
}
@@ -195,16 +194,16 @@ def pluginmap(
model_events = {model: model_events[model]}
result = {
'models': {},
'plugins_dir': str(BUILTIN_PLUGINS_DIR),
'user_plugins_dir': str(USER_PLUGINS_DIR),
"models": {},
"plugins_dir": str(BUILTIN_PLUGINS_DIR),
"user_plugins_dir": str(USER_PLUGINS_DIR),
}
if not quiet:
prnt()
prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
prnt()
for event_name, info in model_events.items():
@@ -218,88 +217,93 @@ def pluginmap(
plugin_name = hook_path.parent.name
is_bg = is_background_hook(hook_path.name)
hook_infos.append({
'path': str(hook_path),
'name': hook_path.name,
'plugin': plugin_name,
'is_background': is_bg,
'extension': hook_path.suffix,
})
hook_infos.append(
{
"path": str(hook_path),
"name": hook_path.name,
"plugin": plugin_name,
"is_background": is_bg,
"extension": hook_path.suffix,
},
)
result['models'][event_name] = {
'description': info['description'],
'machine': info['machine'],
'hooks': hook_infos,
'hook_count': len(hook_infos),
result["models"][event_name] = {
"description": info["description"],
"machine": info["machine"],
"hooks": hook_infos,
"hook_count": len(hook_infos),
}
if not quiet:
# Show diagram if this model has one
if info.get('diagram'):
assert info['diagram'] is not None
prnt(Panel(
info['diagram'],
title=f'[bold green]{info["machine"]}[/bold green]',
border_style='green',
expand=False,
))
if info.get("diagram"):
assert info["diagram"] is not None
prnt(
Panel(
info["diagram"],
title=f"[bold green]{info['machine']}[/bold green]",
border_style="green",
expand=False,
),
)
prnt()
# Create hooks table
table = Table(
title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)",
box=box.ROUNDED,
show_header=True,
header_style='bold magenta',
header_style="bold magenta",
)
table.add_column('Plugin', style='cyan', width=20)
table.add_column('Hook Name', style='green')
table.add_column('BG', justify='center', width=4)
table.add_column('Type', justify='center', width=5)
table.add_column("Plugin", style="cyan", width=20)
table.add_column("Hook Name", style="green")
table.add_column("BG", justify="center", width=4)
table.add_column("Type", justify="center", width=5)
# Sort lexicographically by hook name
sorted_hooks = sorted(hook_infos, key=lambda h: h['name'])
sorted_hooks = sorted(hook_infos, key=lambda h: h["name"])
for hook in sorted_hooks:
bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
ext = hook['extension'].lstrip('.')
bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
ext = hook["extension"].lstrip(".")
table.add_row(
hook['plugin'],
hook['name'],
hook["plugin"],
hook["name"],
bg_marker,
ext,
)
prnt(table)
prnt()
prnt(f'[dim]{info["description"]}[/dim]')
prnt(f"[dim]{info['description']}[/dim]")
prnt()
# Summary
if not quiet:
total_hooks = sum(m['hook_count'] for m in result['models'].values())
prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
total_hooks = sum(m["hook_count"] for m in result["models"].values())
prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
prnt()
prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]')
prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
prnt('[dim] - ext: py, sh, or js[/dim]')
prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
prnt("[dim] - ext: py, sh, or js[/dim]")
prnt()
return result
@click.command()
@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
@docstring(pluginmap.__doc__)
def main(**kwargs):
import json
result = pluginmap(**kwargs)
if kwargs.get('quiet'):
if kwargs.get("quiet"):
print(json.dumps(result, indent=2))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -22,11 +22,10 @@ Examples:
archivebox process list --limit=10
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox process'
__package__ = "archivebox.cli"
__command__ = "archivebox process"
import sys
from typing import Optional
import rich_click as click
from rich import print as rprint
@@ -38,10 +37,11 @@ from archivebox.cli.cli_utils import apply_filters
# LIST
# =============================================================================
def list_processes(
binary_name: Optional[str] = None,
machine_id: Optional[str] = None,
limit: Optional[int] = None,
binary_name: str | None = None,
machine_id: str | None = None,
limit: int | None = None,
) -> int:
"""
List Processes as JSONL with optional filters.
@@ -54,29 +54,29 @@ def list_processes(
is_tty = sys.stdout.isatty()
queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts')
queryset = Process.objects.all().select_related("binary", "machine").order_by("-start_ts")
# Apply filters
filter_kwargs = {}
if binary_name:
filter_kwargs['binary__name'] = binary_name
filter_kwargs["binary__name"] = binary_name
if machine_id:
filter_kwargs['machine_id'] = machine_id
filter_kwargs["machine_id"] = machine_id
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
count = 0
for process in queryset:
if is_tty:
binary_name_str = process.binary.name if process.binary else 'unknown'
exit_code = process.exit_code if process.exit_code is not None else '?'
status_color = 'green' if process.exit_code == 0 else 'red' if process.exit_code else 'yellow'
rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]')
binary_name_str = process.binary.name if process.binary else "unknown"
exit_code = process.exit_code if process.exit_code is not None else "?"
status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow"
rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]")
else:
write_record(process.to_json())
count += 1
rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr)
return 0
@@ -84,24 +84,27 @@ def list_processes(
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Process records (read-only, system-managed)."""
pass
@main.command('list')
@click.option('--binary-name', '-b', help='Filter by binary name')
@click.option('--machine-id', '-m', help='Filter by machine ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--binary-name", "-b", help="Filter by binary name")
@click.option("--machine-id", "-m", help="Filter by machine ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None):
"""List Processes as JSONL."""
sys.exit(list_processes(
binary_name=binary_name,
machine_id=machine_id,
limit=limit,
))
sys.exit(
list_processes(
binary_name=binary_name,
machine_id=machine_id,
limit=limit,
),
)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
__package__ = "archivebox.cli"
__command__ = "archivebox remove"
import shutil
from pathlib import Path
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
@@ -26,25 +26,27 @@ from archivebox.misc.logging_util import (
@enforce_types
def remove(filter_patterns: Iterable[str]=(),
filter_type: str='exact',
snapshots: QuerySet | None=None,
after: float | None=None,
before: float | None=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> QuerySet:
def remove(
filter_patterns: Iterable[str] = (),
filter_type: str = "exact",
snapshots: QuerySet | None = None,
after: float | None = None,
before: float | None = None,
yes: bool = False,
delete: bool = False,
out_dir: Path = DATA_DIR,
) -> QuerySet:
"""Remove the specified URLs from the archive"""
setup_django()
check_data_folder()
from archivebox.cli.archivebox_search import get_snapshots
pattern_list = list(filter_patterns)
log_list_started(pattern_list or None, filter_type)
timer = TimedProgress(360, prefix=' ')
timer = TimedProgress(360, prefix=" ")
try:
snapshots = get_snapshots(
snapshots=snapshots,
@@ -63,7 +65,7 @@ def remove(filter_patterns: Iterable[str]=(),
log_list_finished(snapshots)
log_removal_started(snapshots, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
timer = TimedProgress(360, prefix=" ")
try:
for snapshot in snapshots:
if delete:
@@ -88,17 +90,23 @@ def remove(filter_patterns: Iterable[str]=(),
@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm")
@click.option("--delete", is_flag=True, help="Delete the archived content and metadata folder in addition to removing from index")
@click.option("--before", type=float, help="Remove only URLs bookmarked before timestamp")
@click.option("--after", type=float, help="Remove only URLs bookmarked after timestamp")
@click.option(
"--filter-type",
"-f",
type=click.Choice(("exact", "substring", "domain", "regex", "tag")),
default="exact",
help="Type of pattern matching to use when filtering URLs",
)
@click.argument("filter_patterns", nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
"""Remove the specified URLs from the archive"""
remove(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -37,8 +37,8 @@ Examples:
archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox run'
__package__ = "archivebox.cli"
__command__ = "archivebox run"
import sys
from collections import defaultdict
@@ -87,8 +87,8 @@ def process_stdin_records() -> int:
binary_ids: list[str] = []
for record in records:
record_type = record.get('type', '')
record_id = record.get('id')
record_type = record.get("type", "")
record_id = record.get("id")
try:
if record_type == TYPE_CRAWL:
@@ -97,10 +97,10 @@ def process_stdin_records() -> int:
try:
crawl = Crawl.objects.get(id=record_id)
except Crawl.DoesNotExist:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
else:
# New crawl - create it
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
if crawl:
crawl.retry_at = timezone.now()
@@ -112,16 +112,16 @@ def process_stdin_records() -> int:
output_records.append(crawl.to_json())
queued_count += 1
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
if record_id:
# Existing snapshot - re-queue
try:
snapshot = Snapshot.objects.get(id=record_id)
except Snapshot.DoesNotExist:
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
else:
# New snapshot - create it
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
if snapshot:
snapshot.retry_at = timezone.now()
@@ -132,7 +132,7 @@ def process_stdin_records() -> int:
crawl.retry_at = timezone.now()
if crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
run_all_plugins_for_crawl.add(crawl_id)
@@ -149,11 +149,16 @@ def process_stdin_records() -> int:
else:
archiveresult = None
snapshot_id = record.get('snapshot_id')
plugin_name = record.get('plugin')
snapshot_id = record.get("snapshot_id")
plugin_name = record.get("plugin")
snapshot = None
if archiveresult:
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
if archiveresult.status in [
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
ArchiveResult.StatusChoices.BACKOFF,
]:
archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
plugin_name = plugin_name or archiveresult.plugin
@@ -167,12 +172,12 @@ def process_stdin_records() -> int:
snapshot.retry_at = timezone.now()
if snapshot.status != Snapshot.StatusChoices.STARTED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
crawl = snapshot.crawl
crawl.retry_at = timezone.now()
if crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=["status", "retry_at", "modified_at"])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
if plugin_name:
@@ -203,7 +208,7 @@ def process_stdin_records() -> int:
output_records.append(record)
except Exception as e:
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
continue
# Output all processed records (for chaining)
@@ -212,10 +217,10 @@ def process_stdin_records() -> int:
write_record(rec)
if queued_count == 0:
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
return 0
rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr)
rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)
for binary_id in binary_ids:
run_binary(binary_id)
@@ -245,13 +250,14 @@ def run_runner(daemon: bool = False) -> int:
from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
Process.cleanup_stale_running()
Process.cleanup_orphaned_workers()
recover_orphaned_snapshots()
recover_orphaned_crawls()
Machine.current()
current = Process.current()
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
current.process_type = Process.TypeChoices.ORCHESTRATOR
current.save(update_fields=['process_type', 'modified_at'])
current.save(update_fields=["process_type", "modified_at"])
try:
run_pending_crawls(daemon=daemon)
@@ -259,21 +265,21 @@ def run_runner(daemon: bool = False) -> int:
except KeyboardInterrupt:
return 0
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
return 1
finally:
current.refresh_from_db()
if current.status != Process.StatusChoices.EXITED:
current.status = Process.StatusChoices.EXITED
current.ended_at = current.ended_at or timezone.now()
current.save(update_fields=['status', 'ended_at', 'modified_at'])
current.save(update_fields=["status", "ended_at", "modified_at"])
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--crawl-id', help="Run the crawl runner for a specific crawl only")
@click.option('--snapshot-id', help="Run one snapshot through its crawl")
@click.option('--binary-id', help="Run one queued binary install directly on the bus")
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
"""
Process queued work.
@@ -297,21 +303,24 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
if crawl_id:
try:
from archivebox.services.runner import run_crawl
run_crawl(crawl_id)
sys.exit(0)
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
@@ -333,17 +342,18 @@ def run_snapshot_worker(snapshot_id: str) -> int:
from archivebox.services.runner import run_crawl
try:
snapshot = Snapshot.objects.select_related('crawl').get(id=snapshot_id)
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
return 0
except KeyboardInterrupt:
return 0
except Exception as e:
rprint(f'[red]Runner error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
import traceback
traceback.print_exc()
return 1
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import rich_click as click
from rich import print
@@ -10,18 +10,20 @@ from archivebox.config.common import ARCHIVING_CONFIG
@enforce_types
def schedule(add: bool = False,
show: bool = False,
clear: bool = False,
foreground: bool = False,
run_all: bool = False,
quiet: bool = False,
every: str | None = None,
tag: str = '',
depth: int | str = 0,
overwrite: bool = False,
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
import_path: str | None = None):
def schedule(
add: bool = False,
show: bool = False,
clear: bool = False,
foreground: bool = False,
run_all: bool = False,
quiet: bool = False,
every: str | None = None,
tag: str = "",
depth: int | str = 0,
overwrite: bool = False,
update: bool = not ARCHIVING_CONFIG.ONLY_NEW,
import_path: str | None = None,
):
"""Manage database-backed scheduled crawls processed by the crawl runner."""
from django.utils import timezone
@@ -33,55 +35,51 @@ def schedule(add: bool = False,
depth = int(depth)
result: dict[str, object] = {
'created_schedule_ids': [],
'disabled_count': 0,
'run_all_enqueued': 0,
'active_schedule_ids': [],
"created_schedule_ids": [],
"disabled_count": 0,
"run_all_enqueued": 0,
"active_schedule_ids": [],
}
def _active_schedules():
return CrawlSchedule.objects.filter(is_enabled=True).select_related('template').order_by('created_at')
return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")
if clear:
disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
is_enabled=False,
modified_at=timezone.now(),
)
result['disabled_count'] = disabled_count
print(f'[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]')
result["disabled_count"] = disabled_count
print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")
if every or add:
schedule_str = (every or 'day').strip()
schedule_str = (every or "day").strip()
validate_schedule(schedule_str)
created_by_id = get_or_create_system_user_pk()
is_update_schedule = not import_path
template_urls = import_path or 'archivebox://update'
template_label = (
f'Scheduled import: {template_urls}'
if import_path else
'Scheduled ArchiveBox update'
)[:64]
template_urls = import_path or "archivebox://update"
template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
template_notes = (
f'Created by archivebox schedule for {template_urls}'
if import_path else
'Created by archivebox schedule to queue recurring archivebox://update maintenance crawls.'
f"Created by archivebox schedule for {template_urls}"
if import_path
else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
)
template = Crawl.objects.create(
urls=template_urls,
max_depth=0 if is_update_schedule else depth,
tags_str='' if is_update_schedule else tag,
tags_str="" if is_update_schedule else tag,
label=template_label,
notes=template_notes,
created_by_id=created_by_id,
status=Crawl.StatusChoices.SEALED,
retry_at=None,
config={
'ONLY_NEW': not update,
'OVERWRITE': overwrite,
'DEPTH': 0 if is_update_schedule else depth,
'SCHEDULE_KIND': 'update' if is_update_schedule else 'crawl',
"ONLY_NEW": not update,
"OVERWRITE": overwrite,
"DEPTH": 0 if is_update_schedule else depth,
"SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
},
)
crawl_schedule = CrawlSchedule.objects.create(
@@ -92,31 +90,31 @@ def schedule(add: bool = False,
notes=template_notes,
created_by_id=created_by_id,
)
result['created_schedule_ids'] = [str(crawl_schedule.id)]
result["created_schedule_ids"] = [str(crawl_schedule.id)]
schedule_type = 'maintenance update' if is_update_schedule else 'crawl'
print(f'[green]\\[√] Created scheduled {schedule_type}.[/green]')
print(f' id={crawl_schedule.id}')
print(f' every={crawl_schedule.schedule}')
print(f' next_run={crawl_schedule.next_run_at.isoformat()}')
schedule_type = "maintenance update" if is_update_schedule else "crawl"
print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
print(f" id={crawl_schedule.id}")
print(f" every={crawl_schedule.schedule}")
print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
if import_path:
print(f' source={import_path}')
print(f" source={import_path}")
schedules = list(_active_schedules())
result['active_schedule_ids'] = [str(schedule.id) for schedule in schedules]
result["active_schedule_ids"] = [str(schedule.id) for schedule in schedules]
if show:
if schedules:
print(f'[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]')
print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
for scheduled_crawl in schedules:
template = scheduled_crawl.template
print(
f' - id={scheduled_crawl.id} every={scheduled_crawl.schedule} '
f'next_run={scheduled_crawl.next_run_at.isoformat()} '
f'source={template.urls.splitlines()[0] if template.urls else ""}'
f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
f"next_run={scheduled_crawl.next_run_at.isoformat()} "
f"source={template.urls.splitlines()[0] if template.urls else ''}",
)
else:
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
if run_all:
enqueued = 0
@@ -124,13 +122,17 @@ def schedule(add: bool = False,
for scheduled_crawl in schedules:
scheduled_crawl.enqueue(queued_at=now)
enqueued += 1
result['run_all_enqueued'] = enqueued
print(f'[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]')
result["run_all_enqueued"] = enqueued
print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
if enqueued:
print('[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]')
print(
"[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
)
if foreground:
print('[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]')
print(
"[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
)
run_pending_crawls(daemon=True)
if quiet:
@@ -138,33 +140,38 @@ def schedule(add: bool = False,
if not any((every, add, show, clear, foreground, run_all)):
if schedules:
print('[green]\\[*] Active scheduled crawls:[/green]')
print("[green]\\[*] Active scheduled crawls:[/green]")
for scheduled_crawl in schedules:
print(f' - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}')
print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
else:
print('[yellow]\\[*] No scheduled crawls are enabled.[/yellow]')
print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")
return result
@click.command()
@click.option('--quiet', '-q', is_flag=True, help="Return structured results without extra summary output")
@click.option('--add', is_flag=True, help='Create a new scheduled crawl')
@click.option('--every', type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option('--tag', '-t', default='', help='Comma-separated tags to apply to scheduled crawl snapshots')
@click.option('--depth', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--overwrite', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, help='Retry previously failed/skipped URLs when scheduled crawls run')
@click.option('--clear', is_flag=True, help='Disable all currently enabled schedules')
@click.option('--show', is_flag=True, help='Print all currently enabled schedules')
@click.option('--foreground', '-f', is_flag=True, help='Run the global crawl runner in the foreground (no crontab required)')
@click.option('--run-all', is_flag=True, help='Enqueue all enabled schedules immediately and process them once')
@click.argument('import_path', required=False)
@click.option("--quiet", "-q", is_flag=True, help="Return structured results without extra summary output")
@click.option("--add", is_flag=True, help="Create a new scheduled crawl")
@click.option("--every", type=str, help='Run on an alias like daily/weekly/monthly or a cron expression such as "0 */6 * * *"')
@click.option("--tag", "-t", default="", help="Comma-separated tags to apply to scheduled crawl snapshots")
@click.option(
"--depth",
type=click.Choice([str(i) for i in range(5)]),
default="0",
help="Recursively archive linked pages up to N hops away",
)
@click.option("--overwrite", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, help="Retry previously failed/skipped URLs when scheduled crawls run")
@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules")
@click.option("--show", is_flag=True, help="Print all currently enabled schedules")
@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)")
@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once")
@click.argument("import_path", required=False)
@docstring(schedule.__doc__)
def main(**kwargs):
"""Manage database-backed scheduled crawls processed by the crawl runner."""
schedule(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
__package__ = "archivebox.cli"
__command__ = "archivebox search"
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Callable
from typing import TYPE_CHECKING
from collections.abc import Callable
import rich_click as click
@@ -20,30 +21,28 @@ if TYPE_CHECKING:
# Filter types for URL matching
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
'exact': lambda pattern: Q(url=pattern),
'substring': lambda pattern: Q(url__icontains=pattern),
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: (
Q(url__istartswith=f'http://{pattern}')
| Q(url__istartswith=f'https://{pattern}')
| Q(url__istartswith=f'ftp://{pattern}')
"exact": lambda pattern: Q(url=pattern),
"substring": lambda pattern: Q(url__icontains=pattern),
"regex": lambda pattern: Q(url__iregex=pattern),
"domain": lambda pattern: (
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
),
'tag': lambda pattern: Q(tags__name=pattern),
'timestamp': lambda pattern: Q(timestamp=pattern),
"tag": lambda pattern: Q(tags__name=pattern),
"timestamp": lambda pattern: Q(timestamp=pattern),
}
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
def _apply_pattern_filters(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
filter_patterns: list[str],
filter_type: str,
) -> QuerySet['Snapshot', 'Snapshot']:
) -> QuerySet["Snapshot", "Snapshot"]:
filter_builder = LINK_FILTERS.get(filter_type)
if filter_builder is None:
stderr()
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
raise SystemExit(2)
query = Q()
@@ -53,7 +52,7 @@ def _apply_pattern_filters(
def _snapshots_to_json(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
with_headers: bool,
) -> str:
@@ -63,31 +62,35 @@ def _snapshots_to_json(
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.util import to_json
main_index_header = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': VERSION,
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': {},
},
} if with_headers else {}
main_index_header = (
{
"info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
"schema": "archivebox.index.json",
"copyright_info": SERVER_CONFIG.FOOTER_INFO,
"meta": {
"project": "ArchiveBox",
"version": VERSION,
"git_sha": VERSION,
"website": "https://ArchiveBox.io",
"docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
"source": "https://github.com/ArchiveBox/ArchiveBox",
"issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
"dependencies": {},
},
}
if with_headers
else {}
)
snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]
output: dict[str, object] | list[dict[str, object]]
if with_headers:
output = {
**main_index_header,
'num_links': len(snapshot_dicts),
'updated': datetime.now(tz.utc),
'last_run_cmd': sys.argv,
'links': snapshot_dicts,
"num_links": len(snapshot_dicts),
"updated": datetime.now(tz.utc),
"last_run_cmd": sys.argv,
"links": snapshot_dicts,
}
else:
output = snapshot_dicts
@@ -96,18 +99,18 @@ def _snapshots_to_json(
def _snapshots_to_csv(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
cols: list[str],
with_headers: bool,
) -> str:
header = ','.join(cols) if with_headers else ''
rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)]
return '\n'.join((header, *rows))
header = ",".join(cols) if with_headers else ""
rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)]
return "\n".join((header, *rows))
def _snapshots_to_html(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
*,
with_headers: bool,
) -> str:
@@ -119,26 +122,31 @@ def _snapshots_to_html(
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
template = 'static_index.html' if with_headers else 'minimal_index.html'
template = "static_index.html" if with_headers else "minimal_index.html"
snapshot_list = list(snapshots.iterator(chunk_size=500))
return render_to_string(template, {
'version': VERSION,
'git_sha': get_COMMIT_HASH() or VERSION,
'num_links': str(len(snapshot_list)),
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
'links': snapshot_list,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
})
return render_to_string(
template,
{
"version": VERSION,
"git_sha": get_COMMIT_HASH() or VERSION,
"num_links": str(len(snapshot_list)),
"date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
"time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
"links": snapshot_list,
"FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
},
)
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
filter_patterns: list[str] | None=None,
filter_type: str='substring',
after: float | None=None,
before: float | None=None,
out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
def get_snapshots(
snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
filter_patterns: list[str] | None = None,
filter_type: str = "substring",
after: float | None = None,
before: float | None = None,
out_dir: Path = DATA_DIR,
) -> QuerySet["Snapshot", "Snapshot"]:
"""Filter and return Snapshots matching the given criteria."""
from archivebox.core.models import Snapshot
@@ -155,29 +163,31 @@ def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
result = _apply_pattern_filters(result, filter_patterns, filter_type)
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
result = result.select_related('crawl', 'crawl__created_by')
result = result.select_related("crawl", "crawl__created_by")
if not result.exists():
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
return result
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
status: str='indexed',
before: float | None=None,
after: float | None=None,
sort: str | None=None,
json: bool=False,
html: bool=False,
csv: str | None=None,
with_headers: bool=False):
def search(
filter_patterns: list[str] | None = None,
filter_type: str = "substring",
status: str = "indexed",
before: float | None = None,
after: float | None = None,
sort: str | None = None,
json: bool = False,
html: bool = False,
csv: str | None = None,
with_headers: bool = False,
):
"""List, filter, and export information about archive entries"""
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
raise SystemExit(2)
# Query DB directly - no filesystem scanning
@@ -189,9 +199,9 @@ def search(filter_patterns: list[str] | None=None,
)
# Apply status filter
if status == 'archived':
if status == "archived":
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
elif status == "unarchived":
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)
@@ -204,9 +214,10 @@ def search(filter_patterns: list[str] | None=None,
elif html:
output = _snapshots_to_html(snapshots, with_headers=with_headers)
elif csv:
output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
output = printable_folders(folders, with_headers)
@@ -214,28 +225,33 @@ def search(filter_patterns: list[str] | None=None,
# Structured exports must be written directly to stdout.
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
if not output.endswith("\n"):
sys.stdout.write("\n")
return output
@click.command()
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@click.option(
"--filter-type",
"-f",
type=click.Choice(["search", *LINK_FILTERS.keys()]),
default="substring",
help="Pattern matching type for filtering URLs",
)
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
@click.help_option("--help", "-h")
@click.argument("filter_patterns", nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
return search(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,8 +1,8 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Iterable
from collections.abc import Iterable
import sys
import rich_click as click
@@ -15,20 +15,23 @@ from archivebox.config.common import SERVER_CONFIG
def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
"""Stop any existing orchestrator process so the server can take ownership."""
process_model.cleanup_stale_running(machine=machine)
process_model.cleanup_orphaned_workers()
running_runners = list(process_model.objects.filter(
machine=machine,
status=process_model.StatusChoices.RUNNING,
process_type=process_model.TypeChoices.ORCHESTRATOR,
).order_by('created_at'))
running_runners = list(
process_model.objects.filter(
machine=machine,
status=process_model.StatusChoices.RUNNING,
process_type=process_model.TypeChoices.ORCHESTRATOR,
).order_by("created_at"),
)
if not running_runners:
return 0
log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
log("[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]")
if supervisor is not None and stop_worker_fn is not None:
for worker_name in ('worker_runner', 'worker_runner_watch'):
for worker_name in ("worker_runner", "worker_runner_watch"):
try:
stop_worker_fn(supervisor, worker_name)
except Exception:
@@ -47,23 +50,70 @@ def stop_existing_background_runner(*, machine, process_model, supervisor=None,
return len(running_runners)
def _read_supervisor_worker_command(worker_name: str) -> str:
from archivebox.workers.supervisord_util import WORKERS_DIR_NAME, get_sock_file
worker_conf = get_sock_file().parent / WORKERS_DIR_NAME / f"{worker_name}.conf"
if not worker_conf.exists():
return ""
for line in worker_conf.read_text().splitlines():
if line.startswith("command="):
return line.removeprefix("command=").strip()
return ""
def _worker_command_matches_bind(command: str, host: str, port: str) -> bool:
if not command:
return False
return f"{host}:{port}" in command or (f"--bind={host}" in command and f"--port={port}" in command)
def stop_existing_server_workers(*, supervisor, stop_worker_fn, host: str, port: str, log=print) -> int:
"""Stop existing ArchiveBox web workers if they already own the requested bind."""
stopped = 0
for worker_name in ("worker_runserver", "worker_daphne"):
try:
proc = supervisor.getProcessInfo(worker_name) if supervisor else None
except Exception:
proc = None
if not isinstance(proc, dict) or proc.get("statename") != "RUNNING":
continue
command = _read_supervisor_worker_command(worker_name)
if not _worker_command_matches_bind(command, host, port):
continue
if stopped == 0:
log("[yellow][*] Taking over existing ArchiveBox web server on same port...[/yellow]")
stop_worker_fn(supervisor, worker_name)
stopped += 1
return stopped
@enforce_types
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
reload: bool=False,
init: bool=False,
debug: bool=False,
daemonize: bool=False,
nothreading: bool=False) -> None:
def server(
runserver_args: Iterable[str] = (SERVER_CONFIG.BIND_ADDR,),
reload: bool = False,
init: bool = False,
debug: bool = False,
daemonize: bool = False,
nothreading: bool = False,
) -> None:
"""Run the ArchiveBox HTTP server"""
runserver_args = list(runserver_args)
if init:
from archivebox.cli.archivebox_init import init as archivebox_init
archivebox_init(quick=True)
print()
from archivebox.misc.checks import check_data_folder
check_data_folder()
from archivebox.config.common import SHELL_CONFIG
@@ -73,22 +123,24 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
SHELL_CONFIG.DEBUG = True
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
if not User.objects.filter(is_superuser=True).exclude(username="system").exists():
print()
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
print(' [green]archivebox manage createsuperuser[/green]')
print(
"[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:",
)
print(" [green]archivebox manage createsuperuser[/green]")
print()
host = '127.0.0.1'
port = '8000'
host = "127.0.0.1"
port = "8000"
try:
host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0]
if ':' in host_and_port:
host, port = host_and_port.split(':')
host_and_port = [arg for arg in runserver_args if arg.replace(".", "").replace(":", "").isdigit()][0]
if ":" in host_and_port:
host, port = host_and_port.split(":")
else:
if '.' in host_and_port:
if "." in host_and_port:
host = host_and_port
else:
port = host_and_port
@@ -104,66 +156,80 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
)
from archivebox.machine.models import Machine, Process
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
print(' Stop the conflicting process or choose a different port')
sys.exit(1)
machine = Machine.current()
supervisor = get_existing_supervisord_process()
stop_existing_background_runner(
machine=machine,
process_model=Process,
supervisor=get_existing_supervisord_process(),
supervisor=supervisor,
stop_worker_fn=stop_worker,
)
if supervisor:
stop_existing_server_workers(
supervisor=supervisor,
stop_worker_fn=stop_worker,
host=host,
port=port,
)
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f"[red][X] Error: Port {port} is already in use[/red]")
print(f" Another process (possibly daphne or runserver) is already listening on {host}:{port}")
print(" Stop the conflicting process or choose a different port")
sys.exit(1)
supervisor = get_existing_supervisord_process()
if supervisor:
server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
server_worker_name = "worker_runserver" if run_in_debug else "worker_daphne"
server_proc = get_worker(supervisor, server_worker_name)
server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
if server_state == 'RUNNING':
runner_proc = get_worker(supervisor, 'worker_runner')
runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if runner_state == 'RUNNING':
print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
if runner_watch_state == 'RUNNING':
print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
server_state = server_proc.get("statename") if isinstance(server_proc, dict) else None
if server_state == "RUNNING":
runner_proc = get_worker(supervisor, "worker_runner")
runner_watch_proc = get_worker(supervisor, "worker_runner_watch")
runner_state = runner_proc.get("statename") if isinstance(runner_proc, dict) else None
runner_watch_state = runner_watch_proc.get("statename") if isinstance(runner_watch_proc, dict) else None
print("[red][X] Error: ArchiveBox server is already running[/red]")
print(
f" [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
)
if runner_state == "RUNNING":
print(" [green]√[/green] Background runner (worker_runner) is RUNNING")
if runner_watch_state == "RUNNING":
print(" [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING")
print()
print('[yellow]To stop the existing server, run:[/yellow]')
print("[yellow]To stop the existing server, run:[/yellow]")
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
print(" pkill -f supervisord")
sys.exit(1)
if run_in_debug:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]")
else:
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print("[green][+] Starting ArchiveBox webserver...[/green]")
print(
f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]",
)
print(
f" [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]",
)
print(" > Writing ArchiveBox error log to ./logs/errors.log")
print()
start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@click.command()
@click.argument('runserver_args', nargs=-1)
@click.option('--reload', is_flag=True, help='Enable auto-reloading when code or templates change')
@click.option('--debug', is_flag=True, help='Enable DEBUG=True mode with more verbose errors')
@click.option('--nothreading', is_flag=True, help='Force runserver to run in single-threaded mode')
@click.option('--init', is_flag=True, help='Run a full archivebox init/upgrade before starting the server')
@click.option('--daemonize', is_flag=True, help='Run the server in the background as a daemon')
@click.argument("runserver_args", nargs=-1)
@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change")
@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors")
@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode")
@click.option("--init", is_flag=True, help="Run a full archivebox init/upgrade before starting the server")
@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon")
@docstring(server.__doc__)
def main(**kwargs):
server(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,27 +1,28 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
from archivebox.misc.util import docstring
def shell(args: Iterable[str]=()) -> None:
def shell(args: Iterable[str] = ()) -> None:
"""Enter an interactive ArchiveBox Django shell"""
from django.core.management import call_command
call_command("shell_plus", *args)
@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True))
@click.argument('args', nargs=-1)
@click.argument("args", nargs=-1)
@docstring(shell.__doc__)
def main(args: Iterable[str]=()) -> None:
def main(args: Iterable[str] = ()) -> None:
shell(args=args)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,14 +27,16 @@ Examples:
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
from django.db.models import Q, Sum
from django.db.models.functions import Coalesce
from archivebox.cli.cli_utils import apply_filters
@@ -43,12 +45,13 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_snapshots(
urls: Iterable[str],
tag: str = '',
status: str = 'queued',
tag: str = "",
status: str = "queued",
depth: int = 0,
created_by_id: Optional[int] = None,
created_by_id: int | None = None,
) -> int:
"""
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
@@ -59,8 +62,10 @@ def create_snapshots(
1: Failure
"""
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_CRAWL
read_args_or_stdin,
write_record,
TYPE_SNAPSHOT,
TYPE_CRAWL,
)
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.core.models import Snapshot
@@ -73,7 +78,7 @@ def create_snapshots(
records = list(read_args_or_stdin(urls))
if not records:
rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
return 1
# Process each record - handle Crawls and plain URLs/Snapshots
@@ -81,7 +86,7 @@ def create_snapshots(
pass_through_count = 0
for record in records:
record_type = record.get('type', '')
record_type = record.get("type", "")
try:
if record_type == TYPE_CRAWL:
@@ -91,14 +96,14 @@ def create_snapshots(
# Input is a Crawl - get or create it, then create Snapshots for its URLs
crawl = None
crawl_id = record.get('id')
crawl_id = record.get("id")
if crawl_id:
try:
crawl = Crawl.objects.get(id=crawl_id)
except Crawl.DoesNotExist:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
else:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
if not crawl:
continue
@@ -109,27 +114,27 @@ def create_snapshots(
if tag:
merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
snapshot_record = {
'url': url,
'tags': merged_tags,
'crawl_id': str(crawl.id),
'depth': depth,
'status': status,
"url": url,
"tags": merged_tags,
"crawl_id": str(crawl.id),
"depth": depth,
"status": status,
}
snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
write_record(snapshot.to_json())
elif record_type == TYPE_SNAPSHOT or record.get('url'):
elif record_type == TYPE_SNAPSHOT or record.get("url"):
# Input is a Snapshot or plain URL
if tag and not record.get('tags'):
record['tags'] = tag
if tag and not record.get("tags"):
record["tags"] = tag
if status:
record['status'] = status
record['depth'] = record.get('depth', depth)
record["status"] = status
record["depth"] = record.get("depth", depth)
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
@@ -142,21 +147,21 @@ def create_snapshots(
pass_through_count += 1
except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
continue
if not created_snapshots:
if pass_through_count > 0:
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
return 0
rprint('[red]No snapshots created[/red]', file=sys.stderr)
rprint("[red]No snapshots created[/red]", file=sys.stderr)
return 1
rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
if is_tty:
for snapshot in created_snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
return 0
@@ -165,16 +170,19 @@ def create_snapshots(
# LIST
# =============================================================================
def list_snapshots(
status: Optional[str] = None,
url__icontains: Optional[str] = None,
url__istartswith: Optional[str] = None,
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
sort: Optional[str] = None,
csv: Optional[str] = None,
status: str | None = None,
url__icontains: str | None = None,
url__istartswith: str | None = None,
tag: str | None = None,
crawl_id: str | None = None,
limit: int | None = None,
sort: str | None = None,
csv: str | None = None,
with_headers: bool = False,
search: str | None = None,
query: str | None = None,
) -> int:
"""
List Snapshots as JSONL with optional filters.
@@ -184,64 +192,106 @@ def list_snapshots(
"""
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
from archivebox.search import (
get_default_search_mode,
get_search_mode,
prioritize_metadata_matches,
query_search_index,
)
if with_headers and not csv:
rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
return 2
is_tty = sys.stdout.isatty() and not csv
queryset = Snapshot.objects.all().order_by('-created_at')
queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
# Apply filters
filter_kwargs = {
'status': status,
'url__icontains': url__icontains,
'url__istartswith': url__istartswith,
'crawl_id': crawl_id,
"status": status,
"url__icontains": url__icontains,
"url__istartswith": url__istartswith,
"crawl_id": crawl_id,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
queryset = apply_filters(queryset, filter_kwargs)
# Tag filter requires special handling (M2M)
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
query = (query or "").strip()
if query:
metadata_qs = queryset.filter(
Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
)
requested_search_mode = (search or "").strip().lower()
if requested_search_mode == "content":
requested_search_mode = "contents"
search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
if search_mode == "meta":
queryset = metadata_qs
else:
try:
deep_qsearch = None
if search_mode == "deep":
qsearch = query_search_index(query, search_mode="contents")
deep_qsearch = query_search_index(query, search_mode="deep")
else:
qsearch = query_search_index(query, search_mode=search_mode)
queryset = prioritize_metadata_matches(
queryset,
metadata_qs,
qsearch,
deep_queryset=deep_qsearch,
ordering=("-created_at",) if not sort else None,
)
except Exception as err:
rprint(
f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
file=sys.stderr,
)
queryset = metadata_qs
if sort:
queryset = queryset.order_by(sort)
if limit:
queryset = queryset[:limit]
count = 0
if csv:
cols = [col.strip() for col in csv.split(',') if col.strip()]
cols = [col.strip() for col in csv.split(",") if col.strip()]
if not cols:
rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
return 2
rows: list[str] = []
if with_headers:
rows.append(','.join(cols))
rows.append(",".join(cols))
for snapshot in queryset.iterator(chunk_size=500):
rows.append(snapshot.to_csv(cols=cols, separator=','))
rows.append(snapshot.to_csv(cols=cols, separator=","))
count += 1
output = '\n'.join(rows)
output = "\n".join(rows)
if output:
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
if not output.endswith("\n"):
sys.stdout.write("\n")
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
return 0
for snapshot in queryset:
if is_tty:
status_color = {
'queued': 'yellow',
'started': 'blue',
'sealed': 'green',
}.get(snapshot.status, 'dim')
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
"queued": "yellow",
"started": "blue",
"sealed": "green",
}.get(snapshot.status, "dim")
rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
else:
write_record(snapshot.to_json())
count += 1
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
return 0
@@ -249,9 +299,10 @@ def list_snapshots(
# UPDATE
# =============================================================================
def update_snapshots(
status: Optional[str] = None,
tag: Optional[str] = None,
status: str | None = None,
tag: str | None = None,
) -> int:
"""
Update Snapshots from stdin JSONL.
@@ -272,12 +323,12 @@ def update_snapshots(
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
snapshot_id = record.get('id')
snapshot_id = record.get("id")
if not snapshot_id:
continue
@@ -292,6 +343,7 @@ def update_snapshots(
# Add tag to existing tags
snapshot.save() # Ensure saved before M2M
from archivebox.core.models import Tag
tag_obj, _ = Tag.objects.get_or_create(name=tag)
snapshot.tags.add(tag_obj)
@@ -302,10 +354,10 @@ def update_snapshots(
write_record(snapshot.to_json())
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
return 0
@@ -313,6 +365,7 @@ def update_snapshots(
# DELETE
# =============================================================================
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Snapshots from stdin JSONL.
@@ -328,35 +381,35 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
snapshot_ids = [r.get('id') for r in records if r.get('id')]
snapshot_ids = [r.get("id") for r in records if r.get("id")]
if not snapshot_ids:
rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
return 1
snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
count = snapshots.count()
if count == 0:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
for snapshot in snapshots:
rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = snapshots.delete()
rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
return 0
@@ -364,57 +417,81 @@ def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Snapshot records."""
pass
@main.command('create')
@click.argument('urls', nargs=-1)
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
"""Create Snapshots from URLs or stdin JSONL."""
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
@main.command('list')
@click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
@click.option('--url__icontains', help='Filter by URL contains')
@click.option('--url__istartswith', help='Filter by URL starts with')
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def list_cmd(
status: str | None,
url__icontains: str | None,
url__istartswith: str | None,
tag: str | None,
crawl_id: str | None,
limit: int | None,
sort: str | None,
csv: str | None,
with_headers: bool,
search: str | None,
query: tuple[str, ...],
):
"""List Snapshots as JSONL."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
))
sys.exit(
list_snapshots(
status=status,
url__icontains=url__icontains,
url__istartswith=url__istartswith,
tag=tag,
crawl_id=crawl_id,
limit=limit,
sort=sort,
csv=csv,
with_headers=with_headers,
search=search,
query=" ".join(query),
),
)
@main.command('update')
@click.option('--status', '-s', help='Set status')
@click.option('--tag', '-t', help='Add tag')
def update_cmd(status: Optional[str], tag: Optional[str]):
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--tag", "-t", help="Add tag")
def update_cmd(status: str | None, tag: str | None):
"""Update Snapshots from stdin JSONL."""
sys.exit(update_snapshots(status=status, tag=tag))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Snapshots from stdin JSONL."""
sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"
import sys
@@ -10,15 +10,15 @@ import rich_click as click
from archivebox.cli.archivebox_snapshot import create_snapshots
@click.command(context_settings={'ignore_unknown_options': True})
@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
@click.argument('urls', nargs=-1)
@click.command(context_settings={"ignore_unknown_options": True})
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
@click.argument("urls", nargs=-1)
def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
"""Backwards-compatible `archivebox snapshot URL...` entrypoint."""
sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
from pathlib import Path
@@ -16,31 +16,34 @@ from archivebox.misc.logging_util import printable_filesize
@enforce_types
def status(out_dir: Path=DATA_DIR) -> None:
def status(out_dir: Path = DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
from django.contrib.auth import get_user_model
from django.db.models import Sum
from django.db.models.functions import Coalesce
from archivebox.core.models import Snapshot
User = get_user_model()
print('[green]\\[*] Scanning archive main index...[/green]')
print(f'[yellow] {out_dir}/*[/yellow]')
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
print("[green]\\[*] Scanning archive main index...[/green]")
print(f"[yellow] {out_dir}/*[/yellow]")
num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
size = printable_filesize(num_bytes)
print(f' Index size: {size} across {num_files} files')
print(f" Index size: {size} across {num_files} files")
print()
links = list(Snapshot.objects.all())
links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
num_sql_links = len(links)
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')
print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR.name}/*/index.json)')
print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
print(f" > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
print()
print('[green]\\[*] Scanning archive data directories...[/green]')
users_dir = out_dir / 'users'
print("[green]\\[*] Scanning archive data directories...[/green]")
users_dir = out_dir / "users"
scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
scan_roots_display = ', '.join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
print(f'[yellow] {scan_roots_display}[/yellow]')
scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
print(f"[yellow] {scan_roots_display}[/yellow]")
num_bytes = num_dirs = num_files = 0
for root in scan_roots:
root_bytes, root_dirs, root_files = get_dir_size(root)
@@ -48,80 +51,66 @@ def status(out_dir: Path=DATA_DIR) -> None:
num_dirs += root_dirs
num_files += root_files
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
print(f" Size: {size} across {num_files} files in {num_dirs} directories")
# Use DB as source of truth for snapshot status
num_indexed = len(links)
num_archived = sum(1 for snapshot in links if snapshot.is_archived)
num_unarchived = max(num_indexed - num_archived, 0)
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
print(f" > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")
# Count snapshot directories on filesystem across both legacy and current layouts.
expected_snapshot_dirs = {
str(Path(snapshot.output_dir).resolve())
for snapshot in links
if Path(snapshot.output_dir).exists()
}
expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()}
discovered_snapshot_dirs = set()
if ARCHIVE_DIR.exists():
discovered_snapshot_dirs.update(
str(entry.resolve())
for entry in ARCHIVE_DIR.iterdir()
if entry.is_dir()
)
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())
if users_dir.exists():
discovered_snapshot_dirs.update(
str(entry.resolve())
for entry in users_dir.glob('*/snapshots/*/*/*')
if entry.is_dir()
)
discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())
orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
num_present = len(discovered_snapshot_dirs)
num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
print()
print(f' > present: {num_present}'.ljust(36), '(snapshot directories on disk)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)")
print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)")
num_orphaned = len(orphaned_dirs)
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)")
if num_indexed:
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
print(" [violet]Hint:[/violet] You can list snapshots by status like so:")
print(" [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]")
if orphaned_dirs:
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
print(' [green]archivebox init[/green]')
print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
print(" [green]archivebox init[/green]")
print()
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
admin_users = User.objects.filter(is_superuser=True).exclude(username='system')
print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]")
admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
users = [user.get_username() for user in admin_users]
print(f' UI users {len(users)}: {", ".join(users)}')
last_login = admin_users.order_by('last_login').last()
print(f" UI users {len(users)}: {', '.join(users)}")
last_login = admin_users.order_by("last_login").last()
if last_login:
print(f' Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}')
last_downloaded = Snapshot.objects.order_by('downloaded_at').last()
print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")
last_downloaded = Snapshot.objects.order_by("downloaded_at").last()
if last_downloaded:
print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}')
print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}")
if not users:
print()
print(' [violet]Hint:[/violet] You can create an admin user by running:')
print(' [green]archivebox manage createsuperuser[/green]')
print(" [violet]Hint:[/violet] You can create an admin user by running:")
print(" [green]archivebox manage createsuperuser[/green]")
print()
recent_snapshots = sorted(
links,
key=lambda snapshot: (
snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at
),
key=lambda snapshot: snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at,
reverse=True,
)[:10]
for snapshot in recent_snapshots:
@@ -129,14 +118,14 @@ def status(out_dir: Path=DATA_DIR) -> None:
continue
print(
(
'[grey53] '
f' > {str(snapshot.downloaded_at)[:16]} '
f'[{snapshot.num_outputs} {("X", "")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
"[grey53] "
f" > {str(snapshot.downloaded_at)[:16]} "
f"[{snapshot.num_outputs} {('X', '')[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] "
f'"{snapshot.title}": {snapshot.url}'
'[/grey53]'
)[:SHELL_CONFIG.TERM_WIDTH],
"[/grey53]"
)[: SHELL_CONFIG.TERM_WIDTH],
)
print('[grey53] ...')
print("[grey53] ...")
@click.command()
@@ -146,5 +135,5 @@ def main(**kwargs):
status(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -27,11 +27,11 @@ Examples:
archivebox tag list --name=unused | archivebox tag delete --yes
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox tag'
__package__ = "archivebox.cli"
__command__ = "archivebox tag"
import sys
from typing import Optional, Iterable
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
@@ -43,6 +43,7 @@ from archivebox.cli.cli_utils import apply_filters
# CREATE
# =============================================================================
def create_tags(names: Iterable[str]) -> int:
"""
Create Tags from names.
@@ -60,7 +61,7 @@ def create_tags(names: Iterable[str]) -> int:
name_list = list(names) if names else []
if not name_list:
rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
rprint("[yellow]No tag names provided. Pass names as arguments.[/yellow]", file=sys.stderr)
return 1
created_count = 0
@@ -76,11 +77,11 @@ def create_tags(names: Iterable[str]) -> int:
if created:
created_count += 1
rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr)
rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr)
else:
rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr)
rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr)
rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr)
rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr)
return 0
@@ -88,10 +89,11 @@ def create_tags(names: Iterable[str]) -> int:
# LIST
# =============================================================================
def list_tags(
name: Optional[str] = None,
name__icontains: Optional[str] = None,
limit: Optional[int] = None,
name: str | None = None,
name__icontains: str | None = None,
limit: int | None = None,
) -> int:
"""
List Tags as JSONL with optional filters.
@@ -104,12 +106,12 @@ def list_tags(
is_tty = sys.stdout.isatty()
queryset = Tag.objects.all().order_by('name')
queryset = Tag.objects.all().order_by("name")
# Apply filters
filter_kwargs = {
'name': name,
'name__icontains': name__icontains,
"name": name,
"name__icontains": name__icontains,
}
queryset = apply_filters(queryset, filter_kwargs, limit=limit)
@@ -117,12 +119,12 @@ def list_tags(
for tag in queryset:
snapshot_count = tag.snapshot_set.count()
if is_tty:
rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]')
rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]")
else:
write_record(tag.to_json())
count += 1
rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr)
rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr)
return 0
@@ -130,7 +132,8 @@ def list_tags(
# UPDATE
# =============================================================================
def update_tags(name: Optional[str] = None) -> int:
def update_tags(name: str | None = None) -> int:
"""
Update Tags from stdin JSONL.
@@ -148,13 +151,13 @@ def update_tags(name: Optional[str] = None) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
updated_count = 0
for record in records:
tag_id = record.get('id')
old_name = record.get('name')
tag_id = record.get("id")
old_name = record.get("name")
if not tag_id and not old_name:
continue
@@ -176,10 +179,10 @@ def update_tags(name: Optional[str] = None) -> int:
write_record(tag.to_json())
except Tag.DoesNotExist:
rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr)
rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr)
continue
rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr)
rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr)
return 0
@@ -187,6 +190,7 @@ def update_tags(name: Optional[str] = None) -> int:
# DELETE
# =============================================================================
def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
"""
Delete Tags from stdin JSONL.
@@ -202,23 +206,24 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
records = list(read_stdin())
if not records:
rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
return 1
# Collect tag IDs or names
tag_ids = []
tag_names = []
for r in records:
if r.get('id'):
tag_ids.append(r['id'])
elif r.get('name'):
tag_names.append(r['name'])
if r.get("id"):
tag_ids.append(r["id"])
elif r.get("name"):
tag_names.append(r["name"])
if not tag_ids and not tag_names:
rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr)
rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr)
return 1
from django.db.models import Q
query = Q()
if tag_ids:
query |= Q(id__in=tag_ids)
@@ -229,22 +234,22 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
count = tags.count()
if count == 0:
rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr)
rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr)
return 0
if dry_run:
rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr)
rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr)
for tag in tags:
rprint(f' {tag.name}', file=sys.stderr)
rprint(f" {tag.name}", file=sys.stderr)
return 0
if not yes:
rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
return 1
# Perform deletion
deleted_count, _ = tags.delete()
rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr)
rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr)
return 0
@@ -252,42 +257,43 @@ def delete_tags(yes: bool = False, dry_run: bool = False) -> int:
# CLI Commands
# =============================================================================
@click.group()
def main():
"""Manage Tag records."""
pass
@main.command('create')
@click.argument('names', nargs=-1)
@main.command("create")
@click.argument("names", nargs=-1)
def create_cmd(names: tuple):
"""Create Tags from names."""
sys.exit(create_tags(names))
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
@main.command("list")
@click.option("--name", help="Filter by exact name")
@click.option("--name__icontains", help="Filter by name contains")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(name: str | None, name__icontains: str | None, limit: int | None):
"""List Tags as JSONL."""
sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit))
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
@main.command("update")
@click.option("--name", "-n", help="Set new name")
def update_cmd(name: str | None):
"""Update Tags from stdin JSONL."""
sys.exit(update_tags(name=name))
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
"""Delete Tags from stdin JSONL."""
sys.exit(delete_tags(yes=yes, dry_run=dry_run))
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import os
import time
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Any
from collections.abc import Callable, Iterable
from pathlib import Path
import rich_click as click
@@ -20,24 +21,22 @@ if TYPE_CHECKING:
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
'exact': lambda pattern: Q(url=pattern),
'substring': lambda pattern: Q(url__icontains=pattern),
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: (
Q(url__istartswith=f'http://{pattern}')
| Q(url__istartswith=f'https://{pattern}')
| Q(url__istartswith=f'ftp://{pattern}')
"exact": lambda pattern: Q(url=pattern),
"substring": lambda pattern: Q(url__icontains=pattern),
"regex": lambda pattern: Q(url__iregex=pattern),
"domain": lambda pattern: (
Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
),
'tag': lambda pattern: Q(tags__name=pattern),
'timestamp': lambda pattern: Q(timestamp=pattern),
"tag": lambda pattern: Q(tags__name=pattern),
"timestamp": lambda pattern: Q(timestamp=pattern),
}
def _apply_pattern_filters(
snapshots: QuerySet['Snapshot', 'Snapshot'],
snapshots: QuerySet["Snapshot", "Snapshot"],
filter_patterns: list[str],
filter_type: str,
) -> QuerySet['Snapshot', 'Snapshot']:
) -> QuerySet["Snapshot", "Snapshot"]:
filter_builder = LINK_FILTERS.get(filter_type)
if filter_builder is None:
raise SystemExit(2)
@@ -48,21 +47,120 @@ def _apply_pattern_filters(
return snapshots.filter(query)
def _get_snapshot_crawl(snapshot: 'Snapshot') -> 'Crawl | None':
def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
try:
return snapshot.crawl
except ObjectDoesNotExist:
return None
def _get_search_indexing_plugins() -> list[str]:
    """Return the sorted names of enabled search-backend plugins that expose a Snapshot indexing hook."""
    from abx_dl.models import discover_plugins
    from archivebox.hooks import get_search_backends

    enabled_backends = set(get_search_backends())
    matching: list[str] = []
    for plugin_name, plugin in discover_plugins().items():
        # only consider search backend plugins at all
        if not plugin_name.startswith("search_backend_"):
            continue
        # the backend itself must be enabled/available in this install
        if plugin_name.removeprefix("search_backend_") not in enabled_backends:
            continue
        # keep only plugins that declare a Snapshot index hook
        has_index_hook = any(
            "Snapshot" in hook.name and "index" in hook.name.lower()
            for hook in plugin.hooks
        )
        if has_index_hook:
            matching.append(plugin_name)
    matching.sort()
    return matching
def _build_filtered_snapshots_queryset(
    *,
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    resume: str | None = None,
):
    """Build a Snapshot queryset narrowed by URL patterns, bookmark-time bounds, and resume timestamp.

    Returns the queryset with ``crawl`` pre-fetched, newest bookmark first.
    ``before``/``after`` are interpreted as local-time POSIX timestamps.
    """
    from datetime import datetime

    from archivebox.core.models import Snapshot

    qs = Snapshot.objects.all()
    patterns = list(filter_patterns)
    if patterns:
        qs = _apply_pattern_filters(qs, patterns, filter_type)
    # NOTE: falsy (0.0) bounds are treated the same as None here
    if before:
        qs = qs.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
    if after:
        qs = qs.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
    if resume:
        # resume from a given snapshot timestamp, skipping anything newer
        qs = qs.filter(timestamp__lte=resume)
    return qs.select_related("crawl").order_by("-bookmarked_at")
def reindex_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    search_plugins: list[str],
    batch_size: int,
) -> dict[str, int]:
    """Re-queue search-indexing ArchiveResults for every snapshot in *snapshots*.

    For each snapshot (skipping orphans with no crawl): reconcile its on-disk
    index.json if the output directory exists, reset any previous result for
    each search plugin, and queue a fresh ArchiveResult record. All queued
    records are then run synchronously via run_plugins.

    Returns counters: processed / reconciled / queued / reindexed.
    Raises SystemExit if the plugin run exits non-zero.
    """
    from archivebox.cli.archivebox_extract import run_plugins

    counters = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
    pending: list[dict[str, str]] = []

    total = snapshots.count()
    print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")

    for snap in snapshots.iterator(chunk_size=batch_size):
        counters["processed"] += 1
        # skip orphaned snapshots whose crawl reference is broken
        if _get_snapshot_crawl(snap) is None:
            continue

        snap_dir = Path(snap.output_dir)
        if snap_dir.exists() and snap_dir.is_dir():
            # sync DB state with the on-disk index.json before reindexing
            snap.reconcile_with_index_json()
            counters["reconciled"] += 1

        for plugin_name in search_plugins:
            previous = (
                snap.archiveresult_set
                .filter(plugin=plugin_name)
                .order_by("-created_at")
                .first()
            )
            if previous:
                previous.reset_for_retry()
            pending.append({
                "type": "ArchiveResult",
                "snapshot_id": str(snap.id),
                "plugin": plugin_name,
            })
            counters["queued"] += 1

    if not pending:
        return counters

    exit_code = run_plugins(
        args=(),
        records=pending,
        wait=True,
        emit_results=False,
    )
    if exit_code != 0:
        raise SystemExit(exit_code)

    counters["reindexed"] = len(pending)
    return counters
@enforce_types
def update(filter_patterns: Iterable[str] = (),
filter_type: str = 'exact',
before: float | None = None,
after: float | None = None,
resume: str | None = None,
batch_size: int = 100,
continuous: bool = False) -> None:
def update(
filter_patterns: Iterable[str] = (),
filter_type: str = "exact",
before: float | None = None,
after: float | None = None,
resume: str | None = None,
batch_size: int = 100,
continuous: bool = False,
index_only: bool = False,
) -> None:
"""
Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.
@@ -77,41 +175,69 @@ def update(filter_patterns: Iterable[str] = (),
from rich import print
from archivebox.config.django import setup_django
setup_django()
from django.core.management import call_command
# Run migrations first to ensure DB schema is up-to-date
print('[*] Checking for pending migrations...')
print("[*] Checking for pending migrations...")
try:
call_command('migrate', '--no-input', verbosity=0)
call_command("migrate", "--no-input", verbosity=0)
except Exception as e:
print(f'[!] Warning: Migration check failed: {e}')
print(f"[!] Warning: Migration check failed: {e}")
while True:
if filter_patterns or before or after:
if index_only:
search_plugins = _get_search_indexing_plugins()
if not search_plugins:
print("[*] No search indexing plugins are available, nothing to backfill.")
break
if not (filter_patterns or before or after):
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
drain_old_archive_dirs(
resume_from=resume,
batch_size=batch_size,
)
snapshots = _build_filtered_snapshots_queryset(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
resume=resume,
)
stats = reindex_snapshots(
snapshots,
search_plugins=search_plugins,
batch_size=batch_size,
)
print_index_stats(stats)
elif filter_patterns or before or after:
# Filtered mode: query DB only
print('[*] Processing filtered snapshots from database...')
print("[*] Processing filtered snapshots from database...")
stats = process_filtered_snapshots(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
batch_size=batch_size
resume=resume,
batch_size=batch_size,
)
print_stats(stats)
else:
# Full mode: drain old dirs + process DB
stats_combined = {'phase1': {}, 'phase2': {}}
stats_combined = {"phase1": {}, "phase2": {}}
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
stats_combined['phase1'] = drain_old_archive_dirs(
print("[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...")
stats_combined["phase1"] = drain_old_archive_dirs(
resume_from=resume,
batch_size=batch_size
batch_size=batch_size,
)
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
print("[*] Phase 2: Processing all database snapshots (most recent first)...")
stats_combined["phase2"] = process_all_db_snapshots(batch_size=batch_size, resume=resume)
# Phase 3: Deduplication (disabled for now)
# print('[*] Phase 3: Deduplicating...')
@@ -122,7 +248,7 @@ def update(filter_patterns: Iterable[str] = (),
if not continuous:
break
print('[yellow]Sleeping 60s before next pass...[/yellow]')
print("[yellow]Sleeping 60s before next pass...[/yellow]")
time.sleep(60)
resume = None
@@ -144,34 +270,34 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
from archivebox.config import CONSTANTS
from django.db import transaction
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}
archive_dir = CONSTANTS.ARCHIVE_DIR
if not archive_dir.exists():
return stats
print('[DEBUG Phase1] Scanning for old directories in archive/...')
print("[DEBUG Phase1] Scanning for old directories in archive/...")
# Scan for real directories only (skip symlinks - they're already migrated)
all_entries = list(os.scandir(archive_dir))
print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
entries = [
(e.stat().st_mtime, e.path)
for e in all_entries
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
print(f'[*] Found {len(entries)} old directories to drain')
print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
print(f"[*] Found {len(entries)} old directories to drain")
for mtime, entry_path in entries:
entry_path = Path(entry_path)
# Resume from timestamp if specified
if resume_from and entry_path.name < resume_from:
if resume_from and entry_path.name > resume_from:
continue
stats['processed'] += 1
stats["processed"] += 1
# Try to load existing snapshot from DB
snapshot = Snapshot.load_from_directory(entry_path)
@@ -182,16 +308,16 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if not snapshot:
# Invalid directory - move to invalid/
Snapshot.move_directory_to_invalid(entry_path)
stats['invalid'] += 1
stats["invalid"] += 1
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
try:
snapshot.save()
stats['migrated'] += 1
stats["migrated"] += 1
print(f" [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
except Exception as e:
stats['skipped'] += 1
stats["skipped"] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
continue
@@ -201,30 +327,35 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if not has_valid_crawl:
# Create a new crawl (created_by will default to system user)
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(urls=snapshot.url)
# Use queryset update to avoid triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
# Refresh the instance
snapshot.crawl = crawl
print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
# Check if needs migration (0.8.x → 0.9.x)
print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
print(
f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
)
if snapshot.fs_migration_needed:
try:
# Calculate paths using actual directory (entry_path), not snapshot.timestamp
# because snapshot.timestamp might be truncated
old_dir = entry_path
new_dir = snapshot.get_storage_path_for_version('0.9.0')
new_dir = snapshot.get_storage_path_for_version("0.9.0")
print(f"[DEBUG Phase1] Migrating {old_dir.name}{new_dir}")
# Manually migrate files
if not new_dir.exists() and old_dir.exists():
new_dir.mkdir(parents=True, exist_ok=True)
import shutil
file_count = 0
for old_file in old_dir.rglob('*'):
for old_file in old_dir.rglob("*"):
if old_file.is_file():
rel_path = old_file.relative_to(old_dir)
new_file = new_dir / rel_path
@@ -236,7 +367,8 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
# Commit the transaction
transaction.commit()
@@ -245,22 +377,22 @@ def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100
if old_dir.exists() and old_dir != new_dir:
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
stats["migrated"] += 1
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
except Exception as e:
stats['skipped'] += 1
stats["skipped"] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
else:
stats['skipped'] += 1
stats["skipped"] += 1
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
transaction.commit()
return stats
def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
"""
O(n) scan over entire DB from most recent to least recent.
@@ -275,24 +407,30 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
from django.db import transaction
from django.utils import timezone
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
stats = {"processed": 0, "reconciled": 0, "queued": 0}
total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database (most recent first)...')
queryset = Snapshot.objects.all()
if resume:
queryset = queryset.filter(timestamp__lte=resume)
total = queryset.count()
print(f"[*] Processing {total} snapshots from database (most recent first)...")
# Process from most recent to least recent
for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size):
stats['processed'] += 1
for snapshot in queryset.select_related("crawl").order_by("-bookmarked_at").iterator(chunk_size=batch_size):
stats["processed"] += 1
# Skip snapshots with missing crawl references (orphaned by migration errors)
if _get_snapshot_crawl(snapshot) is None:
continue
try:
print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
print(
f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
)
# Check if snapshot has a directory on disk
from pathlib import Path
output_dir = Path(snapshot.output_dir)
has_directory = output_dir.exists() and output_dir.is_dir()
@@ -313,22 +451,23 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict[str, int]:
print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
# Use queryset update to set fs_version without triggering save() hooks
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
snapshot.fs_version = '0.9.0'
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
snapshot.fs_version = "0.9.0"
# Queue for archiving (state machine will handle it)
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
snapshot.save()
stats['reconciled'] += 1 if has_directory else 0
stats['queued'] += 1
stats["reconciled"] += 1 if has_directory else 0
stats["queued"] += 1
except Exception as e:
# Skip snapshots that can't be processed (e.g., missing crawl)
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
continue
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")
@@ -341,31 +480,28 @@ def process_filtered_snapshots(
filter_type: str,
before: float | None,
after: float | None,
batch_size: int
resume: str | None,
batch_size: int,
) -> dict[str, int]:
"""Process snapshots matching filters (DB query only)."""
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
from datetime import datetime
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
stats = {"processed": 0, "reconciled": 0, "queued": 0}
snapshots = Snapshot.objects.all()
if filter_patterns:
snapshots = _apply_pattern_filters(snapshots, list(filter_patterns), filter_type)
if before:
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
snapshots = _build_filtered_snapshots_queryset(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
resume=resume,
)
total = snapshots.count()
print(f'[*] Found {total} matching snapshots')
print(f"[*] Found {total} matching snapshots")
for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size):
stats['processed'] += 1
for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
stats["processed"] += 1
# Skip snapshots with missing crawl references
if _get_snapshot_crawl(snapshot) is None:
@@ -384,14 +520,14 @@ def process_filtered_snapshots(
snapshot.retry_at = timezone.now()
snapshot.save()
stats['reconciled'] += 1
stats['queued'] += 1
stats["reconciled"] += 1
stats["queued"] += 1
except Exception as e:
# Skip snapshots that can't be processed
print(f" [!] Skipping snapshot {snapshot.id}: {e}")
continue
if stats['processed'] % batch_size == 0:
if stats["processed"] % batch_size == 0:
transaction.commit()
print(f" [{stats['processed']}/{total}] Processed...")
@@ -405,9 +541,9 @@ def print_stats(stats: dict):
print(f"""
[green]Update Complete[/green]
Processed: {stats['processed']}
Reconciled: {stats['reconciled']}
Queued: {stats['queued']}
Processed: {stats["processed"]}
Reconciled: {stats["reconciled"]}
Queued: {stats["queued"]}
""")
@@ -415,37 +551,50 @@ def print_combined_stats(stats_combined: dict):
"""Print statistics for full mode."""
from rich import print
s1 = stats_combined['phase1']
s2 = stats_combined['phase2']
s1 = stats_combined["phase1"]
s2 = stats_combined["phase2"]
print(f"""
[green]Archive Update Complete[/green]
Phase 1 (Drain Old Dirs):
Checked: {s1.get('processed', 0)}
Migrated: {s1.get('migrated', 0)}
Skipped: {s1.get('skipped', 0)}
Invalid: {s1.get('invalid', 0)}
Checked: {s1.get("processed", 0)}
Migrated: {s1.get("migrated", 0)}
Skipped: {s1.get("skipped", 0)}
Invalid: {s1.get("invalid", 0)}
Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}
Processed: {s2.get("processed", 0)}
Reconciled: {s2.get("reconciled", 0)}
Queued: {s2.get("queued", 0)}
""")
def print_index_stats(stats: dict[str, Any]) -> None:
    """Print a rich-formatted summary of a search reindex pass.

    Expects *stats* to contain the 'processed', 'reconciled', 'queued',
    and 'reindexed' counters produced by reindex_snapshots().
    """
    from rich import print

    print(f"""
    [green]Search Reindex Complete[/green]
    Processed:   {stats["processed"]}
    Reconciled:  {stats["reconciled"]}
    Queued:      {stats["queued"]}
    Reindexed:   {stats["reindexed"]}
""")
@click.command()
@click.option('--resume', type=str, help='Resume from timestamp')
@click.option('--before', type=float, help='Only snapshots before timestamp')
@click.option('--after', type=float, help='Only snapshots after timestamp')
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
@click.argument('filter_patterns', nargs=-1)
@click.option("--resume", type=str, help="Resume from timestamp")
@click.option("--before", type=float, help="Only snapshots before timestamp")
@click.option("--after", type=float, help="Only snapshots after timestamp")
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
@click.argument("filter_patterns", nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
update(**kwargs)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__package__ = "archivebox.cli"
import sys
import os
import platform
from pathlib import Path
from typing import Iterable
from collections.abc import Iterable
import rich_click as click
@@ -14,19 +14,22 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def version(quiet: bool=False,
binaries: Iterable[str]=()) -> list[str]:
def version(
quiet: bool = False,
binaries: Iterable[str] = (),
) -> list[str]:
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
# fast path for just getting the version and exiting, dont do any slower imports
from archivebox.config.version import VERSION
print(VERSION)
if quiet or '--version' in sys.argv:
if quiet or "--version" in sys.argv:
return []
from rich.panel import Panel
from rich.console import Console
from archivebox.config import CONSTANTS, DATA_DIR
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
@@ -34,78 +37,89 @@ def version(quiet: bool=False,
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.logging_util import printable_folder_status
from archivebox.config.configset import get_config
console = Console()
prnt = console.print
# Check if LDAP is enabled (simple config lookup)
config = get_config()
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
LDAP_ENABLED = config.get("LDAP_ENABLED", False)
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={get_BUILD_TIME()}',
f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]",
f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}",
f"BUILD_TIME={get_BUILD_TIME()}",
)
prnt(
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
f"IN_DOCKER={IN_DOCKER}",
f"IN_QEMU={SHELL_CONFIG.IN_QEMU}",
f"ARCH={p.machine}",
f"OS={p.system}",
f"PLATFORM={platform.platform()}",
f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""),
)
try:
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
except Exception:
OUTPUT_IS_REMOTE_FS = False
try:
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}",
f"FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}",
f"FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}",
f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}",
)
except Exception:
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}",
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
f'SUDO={CONSTANTS.IS_ROOT}',
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_ENABLED}',
f"DEBUG={SHELL_CONFIG.DEBUG}",
f"IS_TTY={SHELL_CONFIG.IS_TTY}",
f"SUDO={CONSTANTS.IS_ROOT}",
f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}",
f"SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}",
f"LDAP={LDAP_ENABLED}",
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
'',
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
'',
))
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
PANEL_TEXT = "\n".join(
(
"",
"[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...",
" [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.",
"",
" [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]",
"",
),
)
prnt(
Panel(
PANEL_TEXT,
expand=False,
border_style="grey53",
title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]",
subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]",
),
)
prnt()
return []
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]")
failures = []
# Setup Django before importing models
try:
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine, Binary
@@ -113,12 +127,17 @@ def version(quiet: bool=False,
machine = Machine.current()
# Get all binaries from the database with timeout protection
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
all_installed = (
Binary.objects.filter(
machine=machine,
)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("name")
)
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
@@ -126,71 +145,91 @@ def version(quiet: bool=False,
continue
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
version_str = (installed.version or "unknown")[:15]
provider = (installed.binprovider or "env")[:8]
prnt(
"",
"[green]√[/green]",
"",
installed.name.ljust(18),
version_str.ljust(16),
provider.ljust(8),
display_path,
overflow="ignore",
crop=False,
)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]")
except Exception as e:
# Handle database errors gracefully (locked, missing, etc.)
prnt()
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]")
prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]")
if not binaries:
# Show code and data locations
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]")
try:
for name, path in get_code_locations().items():
if isinstance(name, str) and isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
except Exception as e:
prnt(f' [red]Error getting code locations: {e}[/red]')
prnt(f" [red]Error getting code locations: {e}[/red]")
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
prnt("[bright_yellow][i] Data locations:[/bright_yellow]")
try:
for name, path in get_data_locations().items():
if isinstance(name, str) and isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt(printable_folder_status(name, path), overflow="ignore", crop=False)
except Exception as e:
prnt(f' [red]Error getting data locations: {e}[/red]')
prnt(f" [red]Error getting data locations: {e}[/red]")
try:
from archivebox.misc.checks import check_data_dir_permissions
check_data_dir_permissions()
except Exception:
pass
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
prnt("[red][i] Data locations:[/red] (not in a data directory)")
prnt()
if failures:
prnt('[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]')
prnt(f' [red]{", ".join(failures)}[/red]')
prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]")
prnt(f" [red]{', '.join(failures)}[/red]")
prnt()
prnt('[violet]Hint:[/violet] To install missing binaries automatically, run:')
prnt(' [green]archivebox install[/green]')
prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:")
prnt(" [green]archivebox install[/green]")
prnt()
return failures
@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@click.option(
"--quiet",
"-q",
is_flag=True,
help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)",
)
@click.option(
"--binaries",
"-b",
help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)",
)
@docstring(version.__doc__)
def main(**kwargs):
failures = version(**kwargs)
@@ -198,5 +237,5 @@ def main(**kwargs):
raise SystemExit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -5,12 +5,10 @@ This module contains common utilities used across multiple CLI commands,
extracted to avoid code duplication.
"""
__package__ = 'archivebox.cli'
from typing import Optional
__package__ = "archivebox.cli"
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None):
"""
Apply Django-style filters from CLI kwargs to a QuerySet.
@@ -31,11 +29,11 @@ def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""
filters = {}
for key, value in filter_kwargs.items():
if value is None or key in ('limit', 'offset'):
if value is None or key in ("limit", "offset"):
continue
# Handle CSV lists for __in filters
if key.endswith('__in') and isinstance(value, str):
value = [v.strip() for v in value.split(',')]
if key.endswith("__in") and isinstance(value, str):
value = [v.strip() for v in value.split(",")]
filters[key] = value
if filters: