From 057b49ad85011286b2eace4631d20df7f17549d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 19:19:03 +0000 Subject: [PATCH] Update status command to use DB as source of truth Remove imports of deleted folder utility functions and rewrite status command to query Snapshot model directly. This aligns with the fs_version refactor where the DB is the single source of truth. - Use Snapshot.objects queries for indexed/archived/unarchived counts - Scan filesystem directly for present/orphaned directory counts - Simplify output to focus on essential status information --- archivebox/cli/archivebox_status.py | 68 ++++++++++++----------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index e37f01ba..de5ada95 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR from archivebox.config.common import SHELL_CONFIG from archivebox.misc.legacy import parse_json_links_details -from archivebox.misc.folders import ( - get_indexed_folders, - get_archived_folders, - get_invalid_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) from archivebox.misc.system import get_dir_size from archivebox.misc.logging_util import printable_filesize @@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None: size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files in {num_dirs} directories') - num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) - num_archived = len(get_archived_folders(links, out_dir=out_dir)) - num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=out_dir)) - num_valid = len(get_valid_folders(links, out_dir=out_dir)) + # Use DB as source of truth for snapshot status + num_indexed = links.count() + num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count() + num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count() + print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)') + print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)') + print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)') + + # Count directories on filesystem + num_present = 0 + orphaned_dirs = [] + if ARCHIVE_DIR.exists(): + for entry in ARCHIVE_DIR.iterdir(): + if entry.is_dir(): + num_present += 1 + if not links.filter(timestamp=entry.name).exists(): + orphaned_dirs.append(str(entry)) + + num_valid = min(num_present, num_indexed) # approximate print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})') - - duplicate = get_duplicate_folders(links, out_dir=out_dir) - orphaned = get_orphaned_folders(links, out_dir=out_dir) - corrupted = get_corrupted_folders(links, out_dir=out_dir) - unrecognized = get_unrecognized_folders(links, out_dir=out_dir) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') + print(f' > present: {num_present}'.ljust(36), '(directories in archive/)') + print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)') + + num_orphaned = len(orphaned_dirs) + print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)') if num_indexed: - print(' [violet]Hint:[/violet] You can list link data directories by status like so:') - print(' [green]archivebox list --status= (e.g. indexed, corrupted, archived, etc.)[/green]') + print(' [violet]Hint:[/violet] You can list snapshots by status like so:') + print(' [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]') - if orphaned: + if orphaned_dirs: print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') print(' [green]archivebox init[/green]') - if num_invalid: - print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:') - print(' [green]archivebox init[/green]') - print() print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')