Files
ArchiveBox/archivebox/cli/archivebox_search.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

258 lines
8.6 KiB
Python

#!/usr/bin/env python3
__package__ = "archivebox.cli"
__command__ = "archivebox search"
import sys
from pathlib import Path
from typing import TYPE_CHECKING
from collections.abc import Callable
import rich_click as click
from django.db.models import Q, QuerySet
from archivebox.config import DATA_DIR
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
# URL-matching strategies selectable via --filter-type.
# Each entry maps a filter-type name to a builder that turns ONE user-supplied
# pattern string into a Django Q object; the caller ORs the per-pattern Q
# objects together before filtering Snapshots.
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    # Exact equality on the url field.
    "exact": lambda pattern: Q(url=pattern),
    # Case-insensitive "contains" match on url.
    "substring": lambda pattern: Q(url__icontains=pattern),
    # Case-insensitive regular-expression match on url.
    "regex": lambda pattern: Q(url__iregex=pattern),
    # url starts with http://, https:// or ftp:// immediately followed by the pattern.
    # NOTE(review): this is a plain prefix match, so "example.com" would also
    # match "example.com.other.tld" — confirm that is intended.
    "domain": lambda pattern: (
        Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}")
    ),
    # Exact match on the name of a related tag.
    # NOTE(review): this spans the tags relation; if it is many-to-many, OR-ing
    # several tag patterns may return the same snapshot more than once — verify.
    "tag": lambda pattern: Q(tags__name=pattern),
    # Exact equality on the timestamp field.
    "timestamp": lambda pattern: Q(timestamp=pattern),
}
# Values accepted by the --status CLI option.
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
def _apply_pattern_filters(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    filter_patterns: list[str],
    filter_type: str,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Keep only the snapshots that match at least one of ``filter_patterns``.

    Args:
        snapshots: Queryset to narrow down.
        filter_patterns: Patterns to match; a snapshot is kept if ANY pattern matches.
        filter_type: Key into ``LINK_FILTERS`` selecting how each pattern is matched.

    Returns:
        ``snapshots`` filtered by the OR of all per-pattern conditions.

    Raises:
        SystemExit: With code 2 when ``filter_type`` has no entry in ``LINK_FILTERS``.
    """
    if filter_type not in LINK_FILTERS:
        stderr()
        stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
        raise SystemExit(2)

    build_condition = LINK_FILTERS[filter_type]

    # Start from an empty Q and OR in one condition per pattern.
    combined = Q()
    for single_pattern in filter_patterns:
        combined |= build_condition(single_pattern)

    return snapshots.filter(combined)
def _snapshots_to_json(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Serialize ``snapshots`` to a JSON string.

    Args:
        snapshots: Queryset of snapshots to export; each one is converted with
            ``snapshot.to_dict(extended=True)``.
        with_headers: When true, wrap the snapshot list in an index document
            carrying project metadata, a count, a UTC timestamp and the current
            ``sys.argv``. When false, the output is just the list of snapshots.

    Returns:
        The result of ``to_json(..., indent=4, sort_keys=True)``.
    """
    from datetime import datetime, timezone as tz
    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.misc.util import to_json

    # The index header is only built when it will actually be emitted.
    index_header: dict[str, object] = {}
    if with_headers:
        index_header = {
            "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
            "schema": "archivebox.index.json",
            "copyright_info": SERVER_CONFIG.FOOTER_INFO,
            "meta": {
                "project": "ArchiveBox",
                "version": VERSION,
                # NOTE(review): the HTML export in this file uses
                # get_COMMIT_HASH() here; confirm VERSION is intentional.
                "git_sha": VERSION,
                "website": "https://ArchiveBox.io",
                "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
                "source": "https://github.com/ArchiveBox/ArchiveBox",
                "issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
                "dependencies": {},
            },
        }

    # Stream rows from the DB in chunks, but collect them all for serialization.
    links = [snap.to_dict(extended=True) for snap in snapshots.iterator(chunk_size=500)]

    if not with_headers:
        return to_json(links, indent=4, sort_keys=True)

    document: dict[str, object] = {
        **index_header,
        "num_links": len(links),
        "updated": datetime.now(tz.utc),
        "last_run_cmd": sys.argv,
        "links": links,
    }
    return to_json(document, indent=4, sort_keys=True)
def _snapshots_to_csv(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    cols: list[str],
    with_headers: bool,
) -> str:
    """Render ``snapshots`` as comma-separated lines.

    Args:
        snapshots: Queryset of snapshots to export; each row is produced by
            ``snapshot.to_csv(cols=cols, separator=",")``.
        cols: Column names to include, in output order.
        with_headers: When true, the first line is the comma-joined column names.

    Returns:
        The lines joined with ``"\\n"`` (no trailing newline). When
        ``with_headers`` is false the output starts directly with the first
        data row; when there are no rows and no header it is the empty string.
    """
    # Only emit a header line when requested. Joining in an empty header
    # placeholder would prepend a spurious blank line to header-less output.
    lines: list[str] = [",".join(cols)] if with_headers else []
    lines.extend(snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500))
    return "\n".join(lines)
def _snapshots_to_html(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Render ``snapshots`` into a static HTML index.

    Args:
        snapshots: Queryset of snapshots to list in the page.
        with_headers: Selects the template: ``static_index.html`` when true,
            ``minimal_index.html`` when false.

    Returns:
        The string produced by Django's ``render_to_string`` for the chosen template.
    """
    from datetime import datetime, timezone as tz
    from django.template.loader import render_to_string
    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH

    template = "static_index.html" if with_headers else "minimal_index.html"
    snapshot_list = list(snapshots.iterator(chunk_size=500))

    # Take a single timestamp so date_updated and time_updated always agree,
    # even if rendering happens right at a minute or midnight boundary.
    now = datetime.now(tz.utc)

    return render_to_string(
        template,
        {
            "version": VERSION,
            # Fall back to the version string when no commit hash is available.
            "git_sha": get_COMMIT_HASH() or VERSION,
            "num_links": str(len(snapshot_list)),
            "date_updated": now.strftime("%Y-%m-%d"),
            "time_updated": now.strftime("%Y-%m-%d %H:%M"),
            "links": snapshot_list,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
        },
    )
def get_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    after: float | None = None,
    before: float | None = None,
    out_dir: Path = DATA_DIR,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Build a Snapshot queryset restricted by time window and URL patterns.

    Args:
        snapshots: Base queryset to narrow; defaults to all Snapshots.
        filter_patterns: Optional patterns; when non-empty, only snapshots
            matching at least one of them are kept.
        filter_type: Key into ``LINK_FILTERS`` describing how patterns are matched.
        after: Inclusive lower bound applied to ``timestamp`` (``>=``).
        before: Exclusive upper bound applied to ``timestamp`` (``<``).
        out_dir: Accepted for interface compatibility; not used in this function.

    Returns:
        The filtered queryset with ``crawl`` and ``crawl__created_by`` joined in.
        A warning is reported via ``stderr`` when it matches nothing.

    Raises:
        SystemExit: With code 2 if patterns are given and ``filter_type`` is unknown.
    """
    from archivebox.core.models import Snapshot

    matched = Snapshot.objects.all() if snapshots is None else snapshots

    # Apply whichever time bounds were supplied (lower bound first).
    for lookup, bound in (("timestamp__gte", after), ("timestamp__lt", before)):
        if bound is not None:
            matched = matched.filter(**{lookup: bound})

    if filter_patterns:
        matched = _apply_pattern_filters(matched, filter_patterns, filter_type)

    # Join the crawl relationship up front to avoid N+1 queries when callers
    # later access output_dir on each snapshot.
    matched = matched.select_related("crawl", "crawl__created_by")

    if not matched.exists():
        stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")

    return matched
@enforce_types
def search(
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    status: str = "indexed",
    before: float | None = None,
    after: float | None = None,
    sort: str | None = None,
    json: bool = False,
    html: bool = False,
    csv: str | None = None,
    with_headers: bool = False,
):
    """List, filter, and export information about archive entries"""
    # NOTE: the docstring above is reused verbatim as the CLI help text
    # (see the @docstring(search.__doc__) decorator on main), so keep it short.
    #
    # Flow: validate flags -> query the DB -> narrow by status -> sort ->
    # render in the first requested format (json, then html, then csv, else a
    # plain folder listing) -> write to stdout and return the rendered string.
    # Exits with SystemExit(2) when --with-headers is given without a format.

    wants_structured_export = bool(json or html or csv)
    if with_headers and not wants_structured_export:
        stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
        raise SystemExit(2)

    # Query the DB directly - no filesystem scanning.
    patterns = list(filter_patterns) if filter_patterns else None
    snapshots = get_snapshots(
        filter_patterns=patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    # Map each filtering status to the value of downloaded_at__isnull it implies.
    # "indexed" is deliberately absent: it means all snapshots, i.e. no filter.
    downloaded_at_is_null = {"archived": False, "unarchived": True}
    if status in downloaded_at_is_null:
        snapshots = snapshots.filter(downloaded_at__isnull=downloaded_at_is_null[status])

    if sort:
        snapshots = snapshots.order_by(sort)

    # Render in the requested format.
    if json:
        rendered = _snapshots_to_json(snapshots, with_headers=with_headers)
    elif html:
        rendered = _snapshots_to_html(snapshots, with_headers=with_headers)
    elif csv:
        rendered = _snapshots_to_csv(snapshots, cols=csv.split(","), with_headers=with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders

        # printable_folders expects a mapping keyed by output directory.
        folders: dict[str, Snapshot | None] = {str(snapshot.output_dir): snapshot for snapshot in snapshots}
        rendered = printable_folders(folders, with_headers)

    # Structured exports must be written directly to stdout:
    # rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
    sys.stdout.write(rendered)
    if not rendered.endswith("\n"):
        sys.stdout.write("\n")

    return rendered
# CLI entry point. It deliberately has no docstring of its own: the help text
# is taken from search.__doc__ via the @docstring decorator below.
# NOTE(review): "search" is accepted as a --filter-type choice but has no entry
# in LINK_FILTERS, so using it together with patterns ends in the
# "invalid pattern" error path — confirm whether a search backend is meant to
# handle it.
@click.command()
@click.option(
    "--filter-type",
    "-f",
    type=click.Choice(["search", *LINK_FILTERS.keys()]),
    default="substring",
    help="Pattern matching type for filtering URLs",
)
@click.option(
    "--status",
    "-s",
    type=click.Choice(STATUS_CHOICES),
    default="indexed",
    help="List snapshots with the given status",
)
@click.option(
    "--before",
    "-b",
    type=float,
    help="List snapshots bookmarked before the given UNIX timestamp",
)
@click.option(
    "--after",
    "-a",
    type=float,
    help="List snapshots bookmarked after the given UNIX timestamp",
)
@click.option(
    "--sort",
    "-o",
    type=str,
    help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at",
)
@click.option(
    "--json",
    "-J",
    is_flag=True,
    help="Print output in JSON format",
)
@click.option(
    "--html",
    "-M",
    is_flag=True,
    help="Print output in HTML format (suitable for viewing statically without a server)",
)
@click.option(
    "--csv",
    "-C",
    type=str,
    help="Print output as CSV with the provided fields, e.g.: created_at,url,title",
)
@click.option(
    "--with-headers",
    "-H",
    is_flag=True,
    help="Include extra CSV/HTML headers in the output",
)
@click.help_option("--help", "-h")
@click.argument("filter_patterns", nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    # Option and argument names line up with search()'s keyword parameters,
    # so the parsed values are forwarded unchanged.
    return search(**kwargs)


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()