Update status command to use DB as source of truth

Remove the imports of the deleted folder utility functions and rewrite the status command to query the Snapshot model directly. This aligns with the fs_version refactor, where the DB is the single source of truth.

- Use Snapshot.objects queries for the indexed/archived/unarchived counts
- Scan the filesystem directly for the present/orphaned directory counts
- Simplify output to focus on essential status information
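The core of the change: the counts now come from Django ORM queries instead of the removed folder-scanning helpers. As a minimal standalone sketch of the new counting logic from the diff below (assuming links is a queryset over the Snapshot model; the `or` fallbacks appear to cover databases where the status field is not yet populated):

    num_indexed = links.count()                                    # every snapshot row in the DB
    num_archived = (links.filter(status='archived').count()
                    or links.exclude(downloaded_at=None).count())  # fallback: anything that finished downloading
    num_unarchived = (links.filter(status='queued').count()
                      or links.filter(downloaded_at=None).count()) # fallback: anything never downloaded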
@@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring
 from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
 from archivebox.config.common import SHELL_CONFIG
 from archivebox.misc.legacy import parse_json_links_details
-from archivebox.misc.folders import (
-    get_indexed_folders,
-    get_archived_folders,
-    get_invalid_folders,
-    get_unarchived_folders,
-    get_present_folders,
-    get_valid_folders,
-    get_duplicate_folders,
-    get_orphaned_folders,
-    get_corrupted_folders,
-    get_unrecognized_folders,
-)
 from archivebox.misc.system import get_dir_size
 from archivebox.misc.logging_util import printable_filesize
 
@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
     size = printable_filesize(num_bytes)
     print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
 
-    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
-    num_archived = len(get_archived_folders(links, out_dir=out_dir))
-    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
-    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
-    print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
-    print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
+    # Use DB as source of truth for snapshot status
+    num_indexed = links.count()
+    num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
+    num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
+    print(f'    > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
+    print(f'    > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
+    print(f'    > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
 
-    num_present = len(get_present_folders(links, out_dir=out_dir))
-    num_valid = len(get_valid_folders(links, out_dir=out_dir))
+    # Count directories on filesystem
+    num_present = 0
+    orphaned_dirs = []
+    if ARCHIVE_DIR.exists():
+        for entry in ARCHIVE_DIR.iterdir():
+            if entry.is_dir():
+                num_present += 1
+                if not links.filter(timestamp=entry.name).exists():
+                    orphaned_dirs.append(str(entry))
+
+    num_valid = min(num_present, num_indexed)  # approximate
     print()
-    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
-    print(f'    > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
+    print(f'    > present: {num_present}'.ljust(36), '(directories in archive/)')
+    print(f'    > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
 
-    duplicate = get_duplicate_folders(links, out_dir=out_dir)
-    orphaned = get_orphaned_folders(links, out_dir=out_dir)
-    corrupted = get_corrupted_folders(links, out_dir=out_dir)
-    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
-    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
-    print(f'    > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
-    print(f'    > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
-    print(f'    > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
-    print(f'    > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
-    print(f'    > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
+    num_orphaned = len(orphaned_dirs)
+    print(f'    > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
 
     if num_indexed:
-        print('    [violet]Hint:[/violet] You can list link data directories by status like so:')
-        print('        [green]archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)[/green]')
+        print('    [violet]Hint:[/violet] You can list snapshots by status like so:')
+        print('        [green]archivebox list --status=<status>  (e.g. archived, queued, etc.)[/green]')
 
-    if orphaned:
+    if orphaned_dirs:
         print('    [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
         print('        [green]archivebox init[/green]')
 
-    if num_invalid:
-        print('    [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
-        print('        [green]archivebox init[/green]')
-
     print()
     print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
     print(f'[yellow]    {CONSTANTS.LOGS_DIR}/*[/yellow]')
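A note on the filesystem scan above: a directory under archive/ is counted as orphaned when no Snapshot row has a matching timestamp, and num_valid = min(num_present, num_indexed) is explicitly an approximation rather than a per-directory validation. The same check can be reproduced interactively for debugging; a rough sketch, assuming the Snapshot model is importable from core.models as in upstream ArchiveBox (run inside `archivebox shell`):

    from archivebox.config import ARCHIVE_DIR
    from core.models import Snapshot  # assumed import path for the Snapshot model

    orphaned_dirs = []
    if ARCHIVE_DIR.exists():
        orphaned_dirs = [
            str(entry) for entry in ARCHIVE_DIR.iterdir()
            if entry.is_dir() and not Snapshot.objects.filter(timestamp=entry.name).exists()
        ]
    print(f'{len(orphaned_dirs)} orphaned directories under {ARCHIVE_DIR}')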