Update status command to use DB as source of truth

Remove imports of deleted folder utility functions and rewrite
status command to query Snapshot model directly. This aligns with
the fs_version refactor where the DB is the single source of truth.

- Use Snapshot.objects queries for indexed/archived/unarchived counts
- Scan filesystem directly for present/orphaned directory counts
- Simplify output to focus on essential status information
Author: Claude
Date:   2025-12-28 19:19:03 +00:00
parent 767458e4e0
commit 057b49ad85
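
Conceptually, the diff below swaps the old folder-scanning helpers for plain Django ORM
queries plus a single pass over the archive/ directory. A minimal standalone sketch of
that approach is shown here for orientation; the Snapshot import path and the set-based
orphan check are illustrative assumptions, while the downloaded_at and timestamp fields
come from the diff itself:

    # Sketch only: count snapshots straight from the DB, then compare against
    # the directories that actually exist under archive/ on disk.
    from pathlib import Path
    from core.models import Snapshot                # assumed import path for the Snapshot model
    from archivebox.config import ARCHIVE_DIR

    snapshots = Snapshot.objects.all()
    num_indexed = snapshots.count()                                # every snapshot row in the DB
    num_archived = snapshots.exclude(downloaded_at=None).count()   # rows with archived content
    num_unarchived = snapshots.filter(downloaded_at=None).count()  # rows still pending

    # archive/ subdirectories are named by snapshot timestamp; any directory
    # without a matching Snapshot row is treated as orphaned.
    archive_dir = Path(ARCHIVE_DIR)
    on_disk = {p.name for p in archive_dir.iterdir() if p.is_dir()} if archive_dir.exists() else set()
    in_db = set(snapshots.values_list('timestamp', flat=True))
    orphaned_dirs = sorted(on_disk - in_db)

The set difference here is just a compact equivalent of the per-directory
links.filter(timestamp=entry.name).exists() check used in the actual change; both should
yield the same orphaned count.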


@@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring
 from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG
 from archivebox.misc.legacy import parse_json_links_details
-from archivebox.misc.folders import (
-    get_indexed_folders,
-    get_archived_folders,
-    get_invalid_folders,
-    get_unarchived_folders,
-    get_present_folders,
-    get_valid_folders,
-    get_duplicate_folders,
-    get_orphaned_folders,
-    get_corrupted_folders,
-    get_unrecognized_folders,
-)
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.logging_util import printable_filesize
@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
     size = printable_filesize(num_bytes)
     print(f' Size: {size} across {num_files} files in {num_dirs} directories')
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
-    print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
-    print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
-    print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    # Use DB as source of truth for snapshot status
+    num_indexed = links.count()
+    num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
+    num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
+    print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
+    print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
+    print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
+    # Count directories on filesystem
+    num_present = 0
+    orphaned_dirs = []
+    if ARCHIVE_DIR.exists():
+        for entry in ARCHIVE_DIR.iterdir():
+            if entry.is_dir():
+                num_present += 1
+                if not links.filter(timestamp=entry.name).exists():
+                    orphaned_dirs.append(str(entry))
+    num_valid = min(num_present, num_indexed)  # approximate
     print()
-    print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
-    print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
-    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
-    print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
-    print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
-    print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
-    print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
-    print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+    print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
+    print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
+    num_orphaned = len(orphaned_dirs)
+    print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
     if num_indexed:
-        print(' [violet]Hint:[/violet] You can list link data directories by status like so:')
-        print(' [green]archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)[/green]')
+        print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
+        print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
-    if orphaned:
+    if orphaned_dirs:
         print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
         print(' [green]archivebox init[/green]')
-    if num_invalid:
-        print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
-        print(' [green]archivebox init[/green]')
     print()
     print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
     print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
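
After this change, the printed counts can be spot-checked against the DB from a Django
shell inside the data directory (for example via archivebox shell). The session below is
an illustrative sketch, not captured output, and mirrors the downloaded_at-based fallbacks
in the diff rather than the status-field checks:

    $ archivebox shell
    >>> from core.models import Snapshot                      # assumed model import path
    >>> Snapshot.objects.count()                               # 'indexed': total snapshots in the DB
    >>> Snapshot.objects.exclude(downloaded_at=None).count()   # 'archived' (downloaded_at fallback)
    >>> Snapshot.objects.filter(downloaded_at=None).count()    # 'unarchived' (downloaded_at fallback)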