ArchiveBox/archivebox/cli/archivebox_status.py

#!/usr/bin/env python3

__package__ = "archivebox.cli"

from pathlib import Path

import rich_click as click
from rich import print

from archivebox.misc.util import enforce_types, docstring
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.legacy import parse_json_links_details
from archivebox.misc.system import get_dir_size
from archivebox.misc.logging_util import printable_filesize


@enforce_types
def status(out_dir: Path = DATA_DIR) -> None:
    """Print out some info and statistics about the archive collection"""
    # NOTE: keep the docstring above a short one-liner -- the CLI entry point in
    # this file passes `status.__doc__` to its `docstring(...)` decorator.
    #
    # Overview of what gets printed (everything goes to stdout via rich.print,
    # nothing is returned):
    #   1. size of the top-level index files in `out_dir` plus SQL/JSON link counts
    #   2. size of the data directories and DB-vs-filesystem snapshot counts
    #   3. admin users, last login, last download and the most recent downloads
    #
    # NOTE(review): only the index scan and the `users/` directory are derived
    # from `out_dir`; the archive and log locations come from the module-level
    # ARCHIVE_DIR / CONSTANTS values. Confirm that is intended for callers that
    # pass a non-default `out_dir`.

    # Imported inside the function rather than at module level, presumably so
    # that Django is already configured by the time they run -- TODO confirm.
    from django.contrib.auth import get_user_model
    from django.db.models import Sum
    from django.db.models.functions import Coalesce
    from rich.markup import escape
    from archivebox.core.models import Snapshot

    User = get_user_model()

    # ------------------------------------------------------------------
    # 1. Main index
    # ------------------------------------------------------------------
    print("[green]\\[*] Scanning archive main index...[/green]")
    print(f"[yellow]   {out_dir}/*[/yellow]")
    num_bytes, _, num_files = get_dir_size(out_dir, recursive=False, pattern="index.")
    size = printable_filesize(num_bytes)
    print(f"    Index size: {size} across {num_files} files")
    print()

    # Every snapshot is loaded once and reused for all counts further down.
    links = list(Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)))
    num_indexed = len(links)
    num_link_details = sum(1 for _ in parse_json_links_details(out_dir=out_dir))
    print(f"    > SQL Main Index: {num_indexed} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})")
    print(f"    > JSON Link Details: {num_link_details} links".ljust(36), f"(found in {ARCHIVE_DIR.name}/*/index.json)")
    print()

    # ------------------------------------------------------------------
    # 2. Data directories
    # ------------------------------------------------------------------
    print("[green]\\[*] Scanning archive data directories...[/green]")
    users_dir = out_dir / "users"
    scan_roots = [root for root in (ARCHIVE_DIR, users_dir) if root.exists()]
    # Fall back to showing ARCHIVE_DIR when neither root exists yet.
    scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(ARCHIVE_DIR)
    print(f"[yellow]   {scan_roots_display}[/yellow]")
    num_bytes = num_dirs = num_files = 0
    for root in scan_roots:
        root_bytes, root_dirs, root_files = get_dir_size(root)
        num_bytes += root_bytes
        num_dirs += root_dirs
        num_files += root_files
    size = printable_filesize(num_bytes)
    print(f"    Size: {size} across {num_files} files in {num_dirs} directories")

    # Use DB as source of truth for snapshot status
    num_archived = sum(1 for snapshot in links if snapshot.is_archived)
    num_unarchived = max(num_indexed - num_archived, 0)
    print(f"    > indexed: {num_indexed}".ljust(36), "(total snapshots in DB)")
    print(f"      > archived: {num_archived}".ljust(36), "(snapshots with archived content)")
    print(f"      > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)")

    # Count snapshot directories on filesystem across both legacy and current layouts.
    # Paths are resolved on both sides so the set comparison is not thrown off
    # by symlinks or relative segments.
    expected_snapshot_dirs = set()
    for snapshot in links:
        snapshot_dir = Path(snapshot.output_dir)
        if snapshot_dir.exists():
            expected_snapshot_dirs.add(str(snapshot_dir.resolve()))

    discovered_snapshot_dirs = set()

    if ARCHIVE_DIR.exists():
        discovered_snapshot_dirs.update(str(entry.resolve()) for entry in ARCHIVE_DIR.iterdir() if entry.is_dir())

    if users_dir.exists():
        discovered_snapshot_dirs.update(str(entry.resolve()) for entry in users_dir.glob("*/snapshots/*/*/*") if entry.is_dir())

    orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs)
    num_present = len(discovered_snapshot_dirs)
    num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs)
    print()
    print(f"    > present: {num_present}".ljust(36), "(snapshot directories on disk)")
    # ljust() counts the markup tags as characters even though the terminal
    # never shows them; the hand-written leading spaces in the second argument
    # make up for that.
    print(f"      > [green]valid:[/green] {num_valid}".ljust(36), "               (directories with matching DB entry)")

    num_orphaned = len(orphaned_dirs)
    print(f"      > [red]orphaned:[/red] {num_orphaned}".ljust(36), "         (directories without matching DB entry)")

    if num_indexed:
        print("    [violet]Hint:[/violet] You can list snapshots by status like so:")
        print("        [green]archivebox list --status=<status>  (e.g. archived, queued, etc.)[/green]")

    if orphaned_dirs:
        print("    [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:")
        print("        [green]archivebox init[/green]")

    # ------------------------------------------------------------------
    # 3. Recent activity
    # ------------------------------------------------------------------
    print()
    print("[green]\\[*] Scanning recent archive changes and user logins:[/green]")
    print(f"[yellow]   {CONSTANTS.LOGS_DIR}/*[/yellow]")
    admin_users = User.objects.filter(is_superuser=True).exclude(username="system")
    users = [user.get_username() for user in admin_users]
    print(f"    UI users {len(users)}: {', '.join(users)}")

    # Skip accounts that have never logged in: otherwise a NULL `last_login`
    # can be selected and the line would read "... @ None".
    last_login = admin_users.filter(last_login__isnull=False).order_by("last_login").last()
    if last_login:
        print(f"    Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}")

    # Same reasoning: only snapshots that were actually downloaded can tell us
    # when the archive last changed.
    last_downloaded = Snapshot.objects.filter(downloaded_at__isnull=False).order_by("downloaded_at").last()
    if last_downloaded:
        print(f"    Last changes: {str(last_downloaded.downloaded_at)[:16]}")

    if not users:
        print()
        print("    [violet]Hint:[/violet] You can create an admin user by running:")
        print("        [green]archivebox manage createsuperuser[/green]")

    print()
    # Filter to downloaded snapshots *before* taking the newest ten, so that
    # never-downloaded snapshots cannot use up slots and shorten the list.
    downloaded_snapshots = [snapshot for snapshot in links if snapshot.downloaded_at]
    recent_snapshots = sorted(
        downloaded_snapshots,
        key=lambda snapshot: snapshot.downloaded_at,
        reverse=True,
    )[:10]
    for snapshot in recent_snapshots:
        archived_mark = "√" if snapshot.is_archived else "X"
        # Build and truncate the *visible* text first, then escape it and only
        # then wrap it in markup. Titles and URLs come from archived pages and
        # may contain "[...]" sequences that Rich would otherwise parse as tags
        # (dropping text or raising MarkupError), and slicing after adding the
        # tags could cut the closing tag in half.
        summary = (
            " "
            f"   > {str(snapshot.downloaded_at)[:16]} "
            f"[{snapshot.num_outputs} {archived_mark} {printable_filesize(snapshot.archive_size)}] "
            f'"{snapshot.title}": {snapshot.url}'
        )[: SHELL_CONFIG.TERM_WIDTH]
        print(f"[grey53]{escape(summary)}[/grey53]")
    print("[grey53]   ...")


# CLI entry point: exposes status() as a click command.
# `docstring(status.__doc__)` is handed the library function's docstring --
# presumably to reuse it as this command's help text (verify against
# archivebox.misc.util.docstring).
@click.command()
@docstring(status.__doc__)
def main(**kwargs) -> None:
    """Print out some info and statistics about the archive collection"""
    # No click options or arguments are declared on this command; whatever
    # keyword arguments are received are forwarded unchanged to status().
    status(**kwargs)


# Invoke the click command when this file is executed directly as a script.
if __name__ == "__main__":
    main()