mirror of https://github.com/ArchiveBox/ArchiveBox.git — synced 2026-04-05 23:37:58 +10:00 (287 lines, 11 KiB, Python)
#!/usr/bin/env python3
|
|
|
|
__package__ = "archivebox.cli"
|
|
__command__ = "archivebox add"
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
import rich_click as click
|
|
|
|
from django.utils import timezone
|
|
from django.db.models import QuerySet
|
|
|
|
from archivebox.misc.util import enforce_types, docstring
|
|
from archivebox.misc.util import parse_filesize_to_bytes
|
|
from archivebox import CONSTANTS
|
|
from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG
|
|
from archivebox.config.permissions import USER, HOSTNAME
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
from archivebox.core.models import Snapshot
|
|
from archivebox.crawls.models import Crawl
|
|
|
|
|
|
def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
    """Gather archivable URLs from CLI args and/or stdin JSONL records.

    Each record may carry a single ``"url"`` string and/or a multi-line
    ``"urls"`` string; blank lines and ``#`` comment lines in the latter
    are skipped. Order of appearance is preserved; no deduplication.
    """
    from archivebox.misc.jsonl import read_args_or_stdin

    collected: list[str] = []
    for record in read_args_or_stdin(args):
        single = record.get("url")
        if isinstance(single, str) and single:
            collected.append(single)

        multi = record.get("urls")
        if isinstance(multi, str):
            collected.extend(
                stripped
                for stripped in (line.strip() for line in multi.splitlines())
                if stripped and not stripped.startswith("#")
            )

    return collected
|
|
|
|
|
|
@enforce_types
def add(
    urls: str | list[str],
    depth: int | str = 0,
    max_urls: int = 0,
    max_size: int | str = 0,
    tag: str = "",
    url_allowlist: str = "",
    url_denylist: str = "",
    parser: str = "auto",
    plugins: str = "",
    persona: str = "Default",
    overwrite: bool = False,
    update: bool | None = None,
    index_only: bool = False,
    bg: bool = False,
    created_by_id: int | None = None,
) -> tuple["Crawl", QuerySet["Snapshot"]]:
    """Add a new URL or list of URLs to your archive.

    The flow is:
    1. Save URLs to sources file
    2. Create Crawl with URLs and max_depth
    3. Crawl runner creates Snapshots from Crawl URLs (depth=0)
    4. Crawl runner runs parser extractors on root snapshots
    5. Parser extractors output to urls.jsonl
    6. URLs are added to Crawl.urls and child Snapshots are created
    7. Repeat until max_depth is reached
    """

    from rich import print

    # Normalize numeric knobs: the CLI passes depth as a string Choice and
    # max_size as a human-friendly string like "45mb".
    depth = int(depth)
    max_urls = int(max_urls or 0)
    max_size = parse_filesize_to_bytes(max_size)

    if depth not in (0, 1, 2, 3, 4):
        raise ValueError("Depth must be 0-4")
    if max_urls < 0:
        raise ValueError("max_urls must be >= 0")
    if max_size < 0:
        raise ValueError("max_size must be >= 0")

    # import models once django is set up
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.personas.models import Persona
    from archivebox.misc.logging_util import printable_filesize
    from archivebox.misc.system import get_dir_size
    from archivebox.config.configset import get_config
    from archivebox.services.runner import run_crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    started_at = timezone.now()
    if update is None:
        # Default from config: re-process previously-seen URLs unless ONLY_NEW is set.
        update = not ARCHIVING_CONFIG.ONLY_NEW

    # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
    sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__cli_add.txt"
    sources_file.parent.mkdir(parents=True, exist_ok=True)
    sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))

    # 2. Create a new Crawl with inline URLs
    cli_args = [*sys.argv]
    if cli_args[0].lower().endswith("archivebox"):
        # Collapse the full interpreter/binary path to just "archivebox" for a readable label
        cli_args[0] = "archivebox"
    cmd_str = " ".join(cli_args)

    timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")

    # Read URLs directly into crawl
    urls_content = sources_file.read_text()
    persona_name = (persona or "Default").strip() or "Default"
    plugins = plugins or str(get_config().get("PLUGINS") or "")
    persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
    persona_obj.ensure_dirs()

    crawl = Crawl.objects.create(
        urls=urls_content,
        max_depth=depth,
        max_urls=max_urls,
        max_size=max_size,
        tags_str=tag,
        persona_id=persona_obj.id,
        label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
        created_by_id=created_by_id,
        config={
            "ONLY_NEW": not update,
            "INDEX_ONLY": index_only,
            "OVERWRITE": overwrite,
            "PLUGINS": plugins,
            "DEFAULT_PERSONA": persona_name,
            "PARSER": parser,
            # Allow/deny lists are only written into the config when non-empty
            **({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
            **({"URL_DENYLIST": url_denylist} if url_denylist else {}),
        },
    )

    print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
    # FIX: parse the crawl's URL list once (was previously called twice here
    # and a third time in the index_only loop below)
    urls_list = crawl.get_urls_list()
    first_url = urls_list[0] if urls_list else ""
    print(f" [dim]First URL: {first_url}[/dim]")

    # 3. The CrawlMachine will create Snapshots from all URLs when started
    # Parser extractors run on snapshots and discover more URLs
    # Discovered URLs become child Snapshots (depth+1)

    if index_only:
        # Just create the crawl but don't start processing
        print("[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]")
        # Create snapshots for all URLs in the crawl
        for url in urls_list:
            snapshot, _ = Snapshot.objects.update_or_create(
                crawl=crawl,
                url=url,
                defaults={
                    "status": Snapshot.INITIAL_STATE,
                    "retry_at": timezone.now(),
                    "timestamp": str(timezone.now().timestamp()),
                    "depth": 0,
                },
            )
            if tag:
                snapshot.save_tags(tag.split(","))
            snapshot.ensure_crawl_symlink()
        return crawl, crawl.snapshot_set.all()

    # 5. Start the crawl runner to process the queue
    # The runner will:
    #   - Process Crawl -> create Snapshots from all URLs
    #   - Process Snapshots -> run extractors
    #   - Parser extractors discover new URLs -> create child Snapshots
    #   - Repeat until max_depth reached
    if bg:
        # Background mode: just queue work and return (background runner via server will pick it up)
        crawl.create_snapshots_from_urls()
        print(
            "[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
        )
    else:
        # Foreground mode: run full crawl runner until all work is done
        print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
        run_crawl(str(crawl.id))

    # Print a best-effort summary.
    # NOTE(review): despite the original "foreground runs" comment this block
    # runs in background mode too (bg counts will usually be 0) — confirm intent.
    try:
        crawl.refresh_from_db()
        snapshots_count = crawl.snapshot_set.count()
        try:
            from django.db.models import Count, Sum

            # Prefer the DB's own accounting of archived bytes ...
            totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("archiveresult__output_size"))
            total_bytes = int(totals["total_bytes"] or 0) if totals["snapshot_count"] else 0
        except Exception:
            # ... falling back to walking the output dir on disk
            total_bytes, _, _ = get_dir_size(crawl.output_dir)
        total_size = printable_filesize(total_bytes)
        total_time = timezone.now() - started_at
        total_seconds = int(total_time.total_seconds())
        mins, secs = divmod(total_seconds, 60)
        hours, mins = divmod(mins, 60)
        if hours:
            duration_str = f"{hours}h {mins}m {secs}s"
        elif mins:
            duration_str = f"{mins}m {secs}s"
        else:
            duration_str = f"{secs}s"

        # Output dir relative to DATA_DIR
        try:
            rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
            rel_output_str = f"./{rel_output}"
        except Exception:
            rel_output_str = str(crawl.output_dir)

        bind_addr = SERVER_CONFIG.BIND_ADDR or "127.0.0.1:8000"
        if bind_addr.startswith("http://") or bind_addr.startswith("https://"):
            base_url = bind_addr
        else:
            base_url = f"http://{bind_addr}"
        admin_url = f"{base_url}/admin/crawls/crawl/{crawl.id}/change/"

        print("\n[bold]crawl output saved to:[/bold]")
        print(f" {rel_output_str}")
        print(f" {admin_url}")
        print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
        print(f"[bold]total size:[/bold] {total_size}")
        print(f"[bold]total time:[/bold] {duration_str}")
    except Exception:
        # Summary is best-effort; avoid failing the command if something goes wrong
        pass

    # 6. Return the list of Snapshots in this crawl
    snapshots = crawl.snapshot_set.all()
    return crawl, snapshots
|
|
|
|
|
|
@click.command()
@click.option(
    "--depth",
    "-d",
    type=click.Choice([str(i) for i in range(5)]),
    default="0",
    help="Recursively archive linked pages up to N hops away",
)
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
@click.option("--max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
@click.option("--overwrite", "-F", is_flag=True, help="Overwrite existing data if URLs have been archived previously")
@click.option("--update", is_flag=True, default=None, help="Retry any previously skipped/failed URLs when re-adding them")
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
@click.argument("urls", nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""

    # Resolve URLs from positional args and/or stdin before handing off to add()
    urls = _collect_input_urls(kwargs.pop("urls"))
    if not urls:
        raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")

    # Validate numeric limits up-front so errors surface as CLI usage errors
    if int(kwargs.get("max_urls") or 0) < 0:
        raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
    try:
        kwargs["max_size"] = parse_filesize_to_bytes(kwargs.get("max_size"))
    except ValueError as err:
        raise click.BadParameter(str(err), param_hint="--max-size") from err

    add(urls=urls, **kwargs)
|
|
|
|
|
|
# Allow invoking this module directly as a script; the normal entrypoint is the
# archivebox CLI group, which imports `main` as the `add` subcommand.
if __name__ == "__main__":
    main()
|