Files
ArchiveBox/archivebox/cli/archivebox_search.py
2026-03-15 20:12:27 -07:00

242 lines
8.6 KiB
Python

#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox search'
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Callable
import rich_click as click
from django.db.models import Q, QuerySet
from archivebox.config import DATA_DIR
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
# Maps each --filter-type choice to a builder that turns one pattern string
# into the Django Q object used to match Snapshots.
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    'exact':     lambda pattern: Q(url=pattern),
    'substring': lambda pattern: Q(url__icontains=pattern),
    'regex':     lambda pattern: Q(url__iregex=pattern),
    # A domain pattern must sit immediately after the scheme, so anchor it
    # with a case-insensitive prefix match for every supported scheme.
    'domain':    lambda pattern: (
        Q(url__istartswith='http://' + pattern)
        | Q(url__istartswith='https://' + pattern)
        | Q(url__istartswith='ftp://' + pattern)
    ),
    'tag':       lambda pattern: Q(tags__name=pattern),
    'timestamp': lambda pattern: Q(timestamp=pattern),
}

# Values accepted by --status (see search() for how each one filters).
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
def _apply_pattern_filters(
    snapshots: QuerySet['Snapshot', 'Snapshot'],
    filter_patterns: list[str],
    filter_type: str,
) -> QuerySet['Snapshot', 'Snapshot']:
    """Narrow *snapshots* to rows matching ANY of *filter_patterns*.

    The patterns are OR-ed together using the Q builder selected by
    *filter_type*. Prints an error and exits with status 2 when
    *filter_type* is not a LINK_FILTERS key.
    """
    try:
        build_q = LINK_FILTERS[filter_type]
    except KeyError:
        stderr()
        stderr(f'[X] Got invalid pattern for --filter-type={filter_type}', color='red')
        raise SystemExit(2) from None

    combined = Q()
    for pattern in filter_patterns:
        combined = combined | build_q(pattern)
    return snapshots.filter(combined)
def _snapshots_to_json(
    snapshots: QuerySet['Snapshot', 'Snapshot'],
    *,
    with_headers: bool,
) -> str:
    """Serialize *snapshots* to a JSON string.

    When *with_headers* is True, the output is a dict following the
    ``archivebox.index.json`` schema (metadata header plus a ``links``
    list); otherwise it is a bare JSON list of snapshot dicts.
    """
    from datetime import datetime, timezone as tz
    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH
    from archivebox.misc.util import to_json

    # iterator(chunk_size=...) streams rows from the DB instead of
    # materializing the whole queryset at once.
    snapshot_dicts = [snapshot.to_dict(extended=True) for snapshot in snapshots.iterator(chunk_size=500)]

    output: dict[str, object] | list[dict[str, object]]
    if with_headers:
        output = {
            'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
            'schema': 'archivebox.index.json',
            'copyright_info': SERVER_CONFIG.FOOTER_INFO,
            'meta': {
                'project': 'ArchiveBox',
                'version': VERSION,
                # Report the actual commit hash when available, matching the
                # HTML exporter; previously this always duplicated VERSION.
                'git_sha': get_COMMIT_HASH() or VERSION,
                'website': 'https://ArchiveBox.io',
                'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
                'source': 'https://github.com/ArchiveBox/ArchiveBox',
                'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
                'dependencies': {},
            },
            'num_links': len(snapshot_dicts),
            'updated': datetime.now(tz.utc),
            'last_run_cmd': sys.argv,
            'links': snapshot_dicts,
        }
    else:
        output = snapshot_dicts
    return to_json(output, indent=4, sort_keys=True)
def _snapshots_to_csv(
snapshots: QuerySet['Snapshot', 'Snapshot'],
*,
cols: list[str],
with_headers: bool,
) -> str:
header = ','.join(cols) if with_headers else ''
rows = [snapshot.to_csv(cols=cols, separator=',') for snapshot in snapshots.iterator(chunk_size=500)]
return '\n'.join((header, *rows))
def _snapshots_to_html(
    snapshots: QuerySet['Snapshot', 'Snapshot'],
    *,
    with_headers: bool,
) -> str:
    """Render *snapshots* to HTML via the Django template loader.

    with_headers=True produces the full standalone page
    (static_index.html); otherwise the stripped-down fragment
    (minimal_index.html).
    """
    from datetime import datetime, timezone as tz
    from django.template.loader import render_to_string
    from archivebox.config import VERSION
    from archivebox.config.common import SERVER_CONFIG
    from archivebox.config.version import get_COMMIT_HASH

    template_name = 'static_index.html' if with_headers else 'minimal_index.html'
    # Materialize the queryset; the template iterates it and we need len().
    snapshot_list = list(snapshots.iterator(chunk_size=500))
    context = {
        'version': VERSION,
        'git_sha': get_COMMIT_HASH() or VERSION,
        'num_links': str(len(snapshot_list)),
        'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
        'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
        'links': snapshot_list,
        'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
    }
    return render_to_string(template_name, context)
def get_snapshots(snapshots: QuerySet['Snapshot', 'Snapshot'] | None=None,
                  filter_patterns: list[str] | None=None,
                  filter_type: str='substring',
                  after: float | None=None,
                  before: float | None=None,
                  out_dir: Path=DATA_DIR) -> QuerySet['Snapshot', 'Snapshot']:
    """Filter and return Snapshots matching the given criteria.

    Starts from *snapshots* (or all Snapshots when None), applies the
    timestamp window and pattern filters, and warns on stderr when
    nothing matches.
    """
    from archivebox.core.models import Snapshot

    queryset = Snapshot.objects.all() if snapshots is None else snapshots

    # Timestamp window: inclusive lower bound, exclusive upper bound.
    if after is not None:
        queryset = queryset.filter(timestamp__gte=after)
    if before is not None:
        queryset = queryset.filter(timestamp__lt=before)

    if filter_patterns:
        queryset = _apply_pattern_filters(queryset, filter_patterns, filter_type)

    # Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
    queryset = queryset.select_related('crawl', 'crawl__created_by')

    if not queryset.exists():
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return queryset
@enforce_types
def search(filter_patterns: list[str] | None=None,
           filter_type: str='substring',
           status: str='indexed',
           before: float | None=None,
           after: float | None=None,
           sort: str | None=None,
           json: bool=False,
           html: bool=False,
           csv: str | None=None,
           with_headers: bool=False):
    """List, filter, and export information about archive entries"""
    # --with-headers only makes sense for a structured export format.
    if with_headers and not (json or html or csv):
        stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
        raise SystemExit(2)

    # Query DB directly - no filesystem scanning
    snapshots = get_snapshots(
        filter_patterns=list(filter_patterns) if filter_patterns else None,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    # Narrow by archive status; 'indexed' means everything, so no filter.
    if status == 'archived':
        snapshots = snapshots.filter(downloaded_at__isnull=False)
    elif status == 'unarchived':
        snapshots = snapshots.filter(downloaded_at__isnull=True)

    if sort:
        snapshots = snapshots.order_by(sort)

    # Pick the first requested format, in precedence order JSON > HTML > CSV,
    # falling back to the human-readable folder listing.
    if json:
        output = _snapshots_to_json(snapshots, with_headers=with_headers)
    elif html:
        output = _snapshots_to_html(snapshots, with_headers=with_headers)
    elif csv:
        output = _snapshots_to_csv(snapshots, cols=csv.split(','), with_headers=with_headers)
    else:
        from archivebox.misc.logging_util import printable_folders
        # printable_folders expects a mapping of output_dir -> Snapshot.
        folders: dict[str, Snapshot | None] = {snapshot.output_dir: snapshot for snapshot in snapshots}
        output = printable_folders(folders, with_headers)

    # Structured exports must be written directly to stdout.
    # rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
    sys.stdout.write(output)
    if not output.endswith('\n'):
        sys.stdout.write('\n')
    return output
@click.command()
# NOTE(review): 'search' is offered as a --filter-type choice but has no entry
# in LINK_FILTERS, so selecting it currently exits with an error inside
# _apply_pattern_filters — confirm whether a full-text search path is missing here.
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
@click.help_option('--help', '-h')
@click.argument('filter_patterns', nargs=-1)
@docstring(search.__doc__)
def main(**kwargs):
    # Thin CLI shim: click parses argv into kwargs and forwards them to search().
    return search(**kwargs)
if __name__ == '__main__':
    main()