#!/usr/bin/env python3

__package__ = "archivebox.cli"
__command__ = "archivebox search"

import sys
from pathlib import Path
from typing import TYPE_CHECKING
from collections.abc import Callable

import rich_click as click
from django.db.models import Q, QuerySet

from archivebox.config import DATA_DIR
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring

if TYPE_CHECKING:
    from archivebox.core.models import Snapshot

# Filter types for URL matching: each maps a CLI --filter-type name to a
# builder that turns one pattern string into a Django Q object.
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    "exact": lambda pattern: Q(url=pattern),
    "substring": lambda pattern: Q(url__icontains=pattern),
    "regex": lambda pattern: Q(url__iregex=pattern),
    "domain": lambda pattern: (
        Q(url__istartswith=f"http://{pattern}")
        | Q(url__istartswith=f"https://{pattern}")
        | Q(url__istartswith=f"ftp://{pattern}")
    ),
    "tag": lambda pattern: Q(tags__name=pattern),
    "timestamp": lambda pattern: Q(timestamp=pattern),
}

STATUS_CHOICES = ["indexed", "archived", "unarchived"]


def _apply_pattern_filters(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    filter_patterns: list[str],
    filter_type: str,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Narrow ``snapshots`` to rows matching any of ``filter_patterns``.

    ``filter_type`` selects the Q-builder from LINK_FILTERS; the patterns are
    OR-ed together, so a snapshot matching at least one pattern is kept.

    Raises:
        SystemExit(2): if ``filter_type`` is not a key of LINK_FILTERS.
    """
    filter_builder = LINK_FILTERS.get(filter_type)
    if filter_builder is None:
        stderr()
        stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
        raise SystemExit(2)

    query = Q()
    for pattern in filter_patterns:
        query |= filter_builder(pattern)
    return snapshots.filter(query)


def _snapshots_to_json(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Serialize snapshots to a JSON string.

    With ``with_headers=True`` the result is a dict wrapping the snapshot list
    in archive-index metadata; otherwise it is a bare JSON list of snapshots.
    """
    from datetime import datetime, timezone as tz

    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.misc.util import to_json

    main_index_header = (
        {
            "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
            "schema": "archivebox.index.json",
            "copyright_info": SERVER_CONFIG.FOOTER_INFO,
            "meta": {
                "project": "ArchiveBox",
                "version": VERSION,
                # NOTE(review): VERSION used as a stand-in for the commit hash here,
                # unlike the HTML export which calls get_COMMIT_HASH() — confirm intended.
                "git_sha": VERSION,
                "website": "https://ArchiveBox.io",
                "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
                "source": "https://github.com/ArchiveBox/ArchiveBox",
                "issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
                "dependencies": {},
            },
        }
        if with_headers
        else {}
    )

    # iterator(chunk_size=...) keeps memory bounded while scanning large archives.
    snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]

    output: dict[str, object] | list[dict[str, object]]
    if with_headers:
        output = {
            **main_index_header,
            "num_links": len(snapshot_dicts),
            "updated": datetime.now(tz.utc),
            "last_run_cmd": sys.argv,
            "links": snapshot_dicts,
        }
    else:
        output = snapshot_dicts
    return to_json(output, indent=4, sort_keys=True)


def _snapshots_to_csv(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    cols: list[str],
    with_headers: bool,
) -> str:
    """Serialize snapshots to CSV using the given column names.

    Fix: the header row is only prepended when ``with_headers`` is set;
    previously an empty header string was always joined in, producing a
    spurious blank first line when headers were disabled.
    """
    rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500)]
    if with_headers:
        rows.insert(0, ",".join(cols))
    return "\n".join(rows)


def _snapshots_to_html(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Render snapshots to a static HTML index page.

    ``with_headers`` selects the full static index template (with page chrome)
    over the minimal one.
    """
    from datetime import datetime, timezone as tz

    from django.template.loader import render_to_string

    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH

    template = "static_index.html" if with_headers else "minimal_index.html"
    # Materialize the list: the template iterates it and we need len() anyway.
    snapshot_list = list(snapshots.iterator(chunk_size=500))
    return render_to_string(
        template,
        {
            "version": VERSION,
            "git_sha": get_COMMIT_HASH() or VERSION,
            "num_links": str(len(snapshot_list)),
            "date_updated": datetime.now(tz.utc).strftime("%Y-%m-%d"),
            "time_updated": datetime.now(tz.utc).strftime("%Y-%m-%d %H:%M"),
            "links": snapshot_list,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
        },
    )


def get_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    after: float | None = None,
    before: float | None = None,
    out_dir: Path = DATA_DIR,  # kept for interface compatibility; not used by the query
) -> QuerySet["Snapshot", "Snapshot"]:
    """Filter and return Snapshots matching the given criteria."""
    from archivebox.core.models import Snapshot

    if snapshots is not None:
        result = snapshots
    else:
        result = Snapshot.objects.all()

    if after is not None:
        result = result.filter(timestamp__gte=after)
    if before is not None:
        result = result.filter(timestamp__lt=before)
    if filter_patterns:
        result = _apply_pattern_filters(result, filter_patterns, filter_type)

    # Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
    result = result.select_related("crawl", "crawl__created_by")

    if not result.exists():
        stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")

    return result


@enforce_types
def search(
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    status: str = "indexed",
    before: float | None = None,
    after: float | None = None,
    sort: str | None = None,
    json: bool = False,
    html: bool = False,
    csv: str | None = None,
    with_headers: bool = False,
):
    """List, filter, and export information about archive entries"""

    if with_headers and not (json or html or csv):
        stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
        raise SystemExit(2)

    # Query DB directly - no filesystem scanning
    snapshots = get_snapshots(
        filter_patterns=list(filter_patterns) if filter_patterns else None,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    # Apply status filter
    if status == "archived":
        snapshots = snapshots.filter(downloaded_at__isnull=False)
    elif status == "unarchived":
        snapshots = snapshots.filter(downloaded_at__isnull=True)
    # 'indexed' = all snapshots (no filter)

    if sort:
        snapshots = snapshots.order_by(sort)

    # Export to requested format
    if json:
        output = _snapshots_to_json(snapshots, with_headers=with_headers)
    elif html:
        output = _snapshots_to_html(snapshots, with_headers=with_headers)
    elif csv:
        output = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders

        # Convert to dict for printable_folders
        folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
        output = printable_folders(folders, with_headers)

    # Structured exports must be written directly to stdout.
    # rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
    sys.stdout.write(output)
    if not output.endswith("\n"):
        sys.stdout.write("\n")
    return output


@click.command()
@click.option(
    "--filter-type",
    "-f",
    # NOTE(review): "search" is accepted here but has no handler in
    # LINK_FILTERS, so `-f search` exits with an error in
    # _apply_pattern_filters — either wire it to a search backend or drop it.
    type=click.Choice(["search", *LINK_FILTERS.keys()]),
    default="substring",
    help="Pattern matching type for filtering URLs",
)
@click.option("--status", "-s", type=click.Choice(STATUS_CHOICES), default="indexed", help="List snapshots with the given status")
@click.option("--before", "-b", type=float, help="List snapshots bookmarked before the given UNIX timestamp")
@click.option("--after", "-a", type=float, help="List snapshots bookmarked after the given UNIX timestamp")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--json", "-J", is_flag=True, help="Print output in JSON format")
@click.option("--html", "-M", is_flag=True, help="Print output in HTML format (suitable for viewing statically without a server)")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: created_at,url,title")
@click.option("--with-headers", "-H", is_flag=True, help="Include extra CSV/HTML headers in the output")
@click.help_option("--help", "-h")
@click.argument("filter_patterns", nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    return search(**kwargs)


if __name__ == "__main__":
    main()