mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
258 lines
8.6 KiB
Python
258 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
|
|
__package__ = "archivebox.cli"
|
|
__command__ = "archivebox search"
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING
|
|
from collections.abc import Callable
|
|
|
|
import rich_click as click
|
|
|
|
from django.db.models import Q, QuerySet
|
|
|
|
from archivebox.config import DATA_DIR
|
|
from archivebox.misc.logging import stderr
|
|
from archivebox.misc.util import enforce_types, docstring
|
|
|
|
if TYPE_CHECKING:
|
|
from archivebox.core.models import Snapshot
|
|
|
|
# URL-matching strategies. Each key is a --filter-type name and each value
# turns a single user-supplied pattern into a Django Q object.
def _domain_filter(pattern: str) -> Q:
    """Build a Q matching URLs that begin with *pattern* under http, https or ftp."""
    over_http = Q(url__istartswith=f"http://{pattern}")
    over_https = Q(url__istartswith=f"https://{pattern}")
    over_ftp = Q(url__istartswith=f"ftp://{pattern}")
    return over_http | over_https | over_ftp


LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    # Whole-URL equality.
    "exact": lambda pattern: Q(url=pattern),
    # Case-insensitive containment anywhere in the URL.
    "substring": lambda pattern: Q(url__icontains=pattern),
    # Case-insensitive regular expression evaluated by the database.
    "regex": lambda pattern: Q(url__iregex=pattern),
    # Scheme-prefixed, case-insensitive prefix match.
    "domain": _domain_filter,
    # Match on the name of an attached tag.
    "tag": lambda pattern: Q(tags__name=pattern),
    # Match on the snapshot's timestamp field.
    "timestamp": lambda pattern: Q(timestamp=pattern),
}

# Values accepted by the --status option.
STATUS_CHOICES = ["indexed", "archived", "unarchived"]
|
|
|
|
|
|
def _apply_pattern_filters(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    filter_patterns: list[str],
    filter_type: str,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Narrow *snapshots* to the rows matching any of *filter_patterns*.

    The builder registered in LINK_FILTERS under *filter_type* converts each
    pattern into a Q object; the Q objects are OR-ed together and applied in a
    single ``filter()`` call.

    Args:
        snapshots: Queryset to narrow.
        filter_patterns: Patterns to match; a snapshot matching any one is kept.
        filter_type: Key into LINK_FILTERS selecting the matching strategy.

    Returns:
        The filtered queryset.

    Raises:
        SystemExit: With exit code 2 when *filter_type* has no LINK_FILTERS entry.
    """
    filter_builder = LINK_FILTERS.get(filter_type)
    if filter_builder is None:
        stderr()
        stderr(f"[X] Got invalid pattern for --filter-type={filter_type}", color="red")
        raise SystemExit(2)

    query = Q()
    for pattern in filter_patterns:
        query |= filter_builder(pattern)

    matched = snapshots.filter(query)

    # The "tag" lookup spans a multi-valued relation, so OR-ing several tag
    # names yields one row per matching tag. Collapse those so a snapshot that
    # carries more than one of the requested tags is returned only once.
    if filter_type == "tag" and len(filter_patterns) > 1:
        matched = matched.distinct()

    return matched
|
|
|
|
|
|
def _snapshots_to_json(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Serialize *snapshots* to a JSON string.

    Args:
        snapshots: Queryset of snapshots to export; each one is converted with
            ``to_dict(extended=True)``.
        with_headers: When False the result is a bare JSON list of snapshot
            dicts. When True the list is wrapped in an index object that also
            carries project metadata, the link count, the current UTC time and
            the command line that produced it.

    Returns:
        The JSON document, indented by 4 spaces with sorted keys.
    """
    from datetime import datetime, timezone as tz

    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH
    from archivebox.misc.util import to_json

    snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]

    output: dict[str, object] | list[dict[str, object]]
    if with_headers:
        output = {
            "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
            "schema": "archivebox.index.json",
            "copyright_info": SERVER_CONFIG.FOOTER_INFO,
            "meta": {
                "project": "ArchiveBox",
                "version": VERSION,
                # Report the commit hash, falling back to the version string,
                # so this matches the value the HTML export puts in "git_sha".
                "git_sha": get_COMMIT_HASH() or VERSION,
                "website": "https://ArchiveBox.io",
                "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
                "source": "https://github.com/ArchiveBox/ArchiveBox",
                "issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
                "dependencies": {},
            },
            "num_links": len(snapshot_dicts),
            "updated": datetime.now(tz.utc),
            "last_run_cmd": sys.argv,
            "links": snapshot_dicts,
        }
    else:
        output = snapshot_dicts

    return to_json(output, indent=4, sort_keys=True)
|
|
|
|
|
|
def _snapshots_to_csv(
    snapshots: "QuerySet[Snapshot, Snapshot]",
    *,
    cols: list[str],
    with_headers: bool,
) -> str:
    """Serialize *snapshots* to comma-separated text, one snapshot per line.

    Args:
        snapshots: Queryset of snapshots to export; each row is produced by the
            snapshot's own ``to_csv(cols=..., separator=",")``.
        cols: Column names to export, in output order.
        with_headers: When True, the first line is the comma-joined *cols*.

    Returns:
        The rows joined with newlines and no trailing newline. An empty string
        when there are no rows and no header was requested.
    """
    # Only emit a header line when one was asked for. Joining an empty header
    # placeholder would make headerless output start with a blank line.
    lines = [",".join(cols)] if with_headers else []
    lines.extend(snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.iterator(chunk_size=500))
    return "\n".join(lines)
|
|
|
|
|
|
def _snapshots_to_html(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    with_headers: bool,
) -> str:
    """Render *snapshots* to an HTML index page.

    Args:
        snapshots: Queryset of snapshots to list; it is fully loaded into a
            list before rendering.
        with_headers: Selects the template: ``static_index.html`` when True,
            ``minimal_index.html`` otherwise.

    Returns:
        The rendered HTML as a string.
    """
    from datetime import datetime, timezone as tz

    from django.template.loader import render_to_string

    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH

    template = "static_index.html" if with_headers else "minimal_index.html"
    snapshot_list = list(snapshots.iterator(chunk_size=500))

    # Read the clock once so the date and the date-time shown on the page are
    # derived from the same instant and cannot disagree across a boundary.
    now = datetime.now(tz.utc)

    return render_to_string(
        template,
        {
            "version": VERSION,
            "git_sha": get_COMMIT_HASH() or VERSION,
            "num_links": str(len(snapshot_list)),
            "date_updated": now.strftime("%Y-%m-%d"),
            "time_updated": now.strftime("%Y-%m-%d %H:%M"),
            "links": snapshot_list,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
        },
    )
|
|
|
|
|
|
def get_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"] | None = None,
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    after: float | None = None,
    before: float | None = None,
    out_dir: Path = DATA_DIR,
) -> QuerySet["Snapshot", "Snapshot"]:
    """Return the Snapshots that satisfy the given time bounds and patterns.

    Args:
        snapshots: Starting queryset; all Snapshots are used when None.
        filter_patterns: Patterns to match; skipped when empty or None.
        filter_type: Matching strategy name used for *filter_patterns*.
        after: Keep snapshots whose timestamp is greater than or equal to this.
        before: Keep snapshots whose timestamp is strictly less than this.
        out_dir: Not read by this function.

    Returns:
        The filtered queryset with the crawl relations joined in. A warning is
        written to stderr when it is empty.
    """
    from archivebox.core.models import Snapshot

    matched = Snapshot.objects.all() if snapshots is None else snapshots

    # Apply whichever time bounds were supplied, lower bound first.
    time_bounds = {"timestamp__gte": after, "timestamp__lt": before}
    for lookup, bound in time_bounds.items():
        if bound is not None:
            matched = matched.filter(**{lookup: bound})

    if filter_patterns:
        matched = _apply_pattern_filters(matched, filter_patterns, filter_type)

    # Join the crawl relations up front to avoid N+1 queries when output_dir
    # is accessed on each snapshot later.
    matched = matched.select_related("crawl", "crawl__created_by")

    if matched.exists():
        return matched

    stderr("[!] No Snapshots matched your filters:", filter_patterns, f"({filter_type})", color="lightyellow")
    return matched
|
|
|
|
|
|
@enforce_types
def search(
    filter_patterns: list[str] | None = None,
    filter_type: str = "substring",
    status: str = "indexed",
    before: float | None = None,
    after: float | None = None,
    sort: str | None = None,
    json: bool = False,
    html: bool = False,
    csv: str | None = None,
    with_headers: bool = False,
):
    """List, filter, and export information about archive entries"""
    # The docstring above is also handed to @docstring on the CLI wrapper, so
    # the longer explanation lives in comments instead.
    #
    # Flow: validate flags -> query the DB -> narrow by status -> sort ->
    # render in the first requested format (json, then html, then csv, else a
    # folder listing) -> write to stdout and return the rendered text.

    export_requested = json or html or csv
    if with_headers and not export_requested:
        stderr("[X] --with-headers requires --json, --html or --csv\n", color="red")
        raise SystemExit(2)

    # Everything comes from the database; the filesystem is not scanned.
    patterns = list(filter_patterns) if filter_patterns else None
    snapshots = get_snapshots(
        filter_patterns=patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    # "archived" keeps rows with a download time, "unarchived" keeps rows
    # without one; any other status (including "indexed") keeps everything.
    if status in ("archived", "unarchived"):
        snapshots = snapshots.filter(downloaded_at__isnull=(status == "unarchived"))

    if sort:
        snapshots = snapshots.order_by(sort)

    if json:
        output = _snapshots_to_json(snapshots, with_headers=with_headers)
    elif html:
        output = _snapshots_to_html(snapshots, with_headers=with_headers)
    elif csv:
        column_names = csv.split(",")
        output = _snapshots_to_csv(snapshots, cols=column_names, with_headers=with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders

        # printable_folders takes a mapping keyed by output directory.
        folders: dict[str, Snapshot | None] = {}
        for snapshot in snapshots:
            folders[str(snapshot.output_dir)] = snapshot
        output = printable_folders(folders, with_headers)

    # Write straight to stdout: rich.print() reflows long lines to the console
    # width, which corrupts JSON/CSV/HTML output.
    sys.stdout.write(output)
    if not output.endswith("\n"):
        sys.stdout.write("\n")
    return output
|
|
|
|
|
|
# Command-line entry point for `archivebox search`: each option below is parsed
# by click and forwarded unchanged to search() as a keyword argument.
# NOTE(review): "search" is accepted by --filter-type but has no LINK_FILTERS
# entry, so combining it with patterns reaches the invalid filter-type exit in
# _apply_pattern_filters - confirm whether that choice should still be offered.
# NOTE(review): @docstring receives search.__doc__, presumably to reuse it as
# this command's help text - confirm in archivebox.misc.util.
@click.command()
@click.option(
    "--filter-type",
    "-f",
    type=click.Choice(["search", *LINK_FILTERS]),
    default="substring",
    help="Pattern matching type for filtering URLs",
)
@click.option(
    "--status",
    "-s",
    type=click.Choice(STATUS_CHOICES),
    default="indexed",
    help="List snapshots with the given status",
)
@click.option(
    "--before",
    "-b",
    type=float,
    help="List snapshots bookmarked before the given UNIX timestamp",
)
@click.option(
    "--after",
    "-a",
    type=float,
    help="List snapshots bookmarked after the given UNIX timestamp",
)
@click.option(
    "--sort",
    "-o",
    type=str,
    help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at",
)
@click.option(
    "--json",
    "-J",
    is_flag=True,
    help="Print output in JSON format",
)
@click.option(
    "--html",
    "-M",
    is_flag=True,
    help="Print output in HTML format (suitable for viewing statically without a server)",
)
@click.option(
    "--csv",
    "-C",
    type=str,
    help="Print output as CSV with the provided fields, e.g.: created_at,url,title",
)
@click.option(
    "--with-headers",
    "-H",
    is_flag=True,
    help="Include extra CSV/HTML headers in the output",
)
@click.help_option("--help", "-h")
@click.argument("filter_patterns", nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    result = search(**kwargs)
    return result


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|