mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
286 lines
8.7 KiB
Python
286 lines
8.7 KiB
Python
#!/usr/bin/env python3
|
|
|
|
__package__ = 'archivebox.cli'
|
|
|
|
import os
|
|
import time
|
|
import rich_click as click
|
|
|
|
from typing import Iterable
|
|
from pathlib import Path
|
|
|
|
from archivebox.misc.util import enforce_types, docstring
|
|
|
|
|
|
@enforce_types
|
|
def update(filter_patterns: Iterable[str] = (),
|
|
filter_type: str = 'exact',
|
|
before: float | None = None,
|
|
after: float | None = None,
|
|
resume: str | None = None,
|
|
batch_size: int = 100,
|
|
continuous: bool = False) -> None:
|
|
"""
|
|
Update snapshots: import orphans, reconcile, and re-run failed extractors.
|
|
|
|
Two-phase operation:
|
|
- Phase 1: Scan archive/ for orphaned snapshots (skip symlinks)
|
|
- Phase 2: Process all DB snapshots (reconcile + re-queue for archiving)
|
|
- Phase 3: Deduplicate exact duplicates
|
|
|
|
With filters: Only phase 2 (DB query), no filesystem scan.
|
|
Without filters: All phases (full update).
|
|
"""
|
|
|
|
from rich import print
|
|
from archivebox.config.django import setup_django
|
|
setup_django()
|
|
|
|
from archivebox.core.models import Snapshot
|
|
from django.utils import timezone
|
|
|
|
while True:
|
|
if filter_patterns or before or after:
|
|
# Filtered mode: query DB only
|
|
print('[*] Processing filtered snapshots from database...')
|
|
stats = process_filtered_snapshots(
|
|
filter_patterns=filter_patterns,
|
|
filter_type=filter_type,
|
|
before=before,
|
|
after=after,
|
|
batch_size=batch_size
|
|
)
|
|
print_stats(stats)
|
|
else:
|
|
# Full mode: import orphans + process DB + deduplicate
|
|
stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}
|
|
|
|
print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
|
|
stats_combined['phase1'] = import_orphans_from_archive(
|
|
resume_from=resume,
|
|
batch_size=batch_size
|
|
)
|
|
|
|
print('[*] Phase 2: Processing all database snapshots...')
|
|
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
|
|
|
|
print('[*] Phase 3: Deduplicating...')
|
|
stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
|
|
|
|
print_combined_stats(stats_combined)
|
|
|
|
if not continuous:
|
|
break
|
|
|
|
print('[yellow]Sleeping 60s before next pass...[/yellow]')
|
|
time.sleep(60)
|
|
resume = None
|
|
|
|
|
|
def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict:
|
|
"""
|
|
Scan archive/ for orphaned snapshots.
|
|
Skip symlinks (already migrated).
|
|
Create DB records and trigger migration on save().
|
|
"""
|
|
from archivebox.core.models import Snapshot
|
|
from archivebox.config import CONSTANTS
|
|
from django.db import transaction
|
|
|
|
stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
|
|
|
|
archive_dir = CONSTANTS.ARCHIVE_DIR
|
|
if not archive_dir.exists():
|
|
return stats
|
|
|
|
print('[*] Scanning and sorting by modification time...')
|
|
|
|
# Scan and sort by mtime (newest first)
|
|
# Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries)
|
|
entries = [
|
|
(e.stat().st_mtime, e.path)
|
|
for e in os.scandir(archive_dir)
|
|
if e.is_dir(follow_symlinks=False) # Skip symlinks
|
|
]
|
|
entries.sort(reverse=True) # Newest first
|
|
print(f'[*] Found {len(entries)} directories to check')
|
|
|
|
for mtime, entry_path in entries:
|
|
entry_path = Path(entry_path)
|
|
|
|
# Resume from timestamp if specified
|
|
if resume_from and entry_path.name < resume_from:
|
|
continue
|
|
|
|
stats['processed'] += 1
|
|
|
|
# Check if already in DB
|
|
snapshot = Snapshot.load_from_directory(entry_path)
|
|
if snapshot:
|
|
continue # Already in DB, skip
|
|
|
|
# Not in DB - create orphaned snapshot
|
|
snapshot = Snapshot.create_from_directory(entry_path)
|
|
if not snapshot:
|
|
# Invalid directory
|
|
Snapshot.move_directory_to_invalid(entry_path)
|
|
stats['invalid'] += 1
|
|
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
|
|
continue
|
|
|
|
needs_migration = snapshot.fs_migration_needed
|
|
|
|
snapshot.save() # Creates DB record + triggers migration
|
|
|
|
stats['imported'] += 1
|
|
if needs_migration:
|
|
stats['migrated'] += 1
|
|
print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}")
|
|
else:
|
|
print(f" [{stats['processed']}] Imported: {entry_path.name}")
|
|
|
|
if stats['processed'] % batch_size == 0:
|
|
transaction.commit()
|
|
|
|
transaction.commit()
|
|
return stats
|
|
|
|
|
|
def process_all_db_snapshots(batch_size: int = 100) -> dict:
|
|
"""
|
|
Process all snapshots in DB.
|
|
Reconcile index.json and queue for archiving.
|
|
"""
|
|
from archivebox.core.models import Snapshot
|
|
from django.db import transaction
|
|
from django.utils import timezone
|
|
|
|
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
|
|
|
total = Snapshot.objects.count()
|
|
print(f'[*] Processing {total} snapshots from database...')
|
|
|
|
for snapshot in Snapshot.objects.iterator():
|
|
# Reconcile index.json with DB
|
|
snapshot.reconcile_with_index_json()
|
|
|
|
# Queue for archiving (state machine will handle it)
|
|
snapshot.status = Snapshot.StatusChoices.QUEUED
|
|
snapshot.retry_at = timezone.now()
|
|
snapshot.save()
|
|
|
|
stats['reconciled'] += 1
|
|
stats['queued'] += 1
|
|
stats['processed'] += 1
|
|
|
|
if stats['processed'] % batch_size == 0:
|
|
transaction.commit()
|
|
print(f" [{stats['processed']}/{total}] Processed...")
|
|
|
|
transaction.commit()
|
|
return stats
|
|
|
|
|
|
def process_filtered_snapshots(
|
|
filter_patterns: Iterable[str],
|
|
filter_type: str,
|
|
before: float | None,
|
|
after: float | None,
|
|
batch_size: int
|
|
) -> dict:
|
|
"""Process snapshots matching filters (DB query only)."""
|
|
from archivebox.core.models import Snapshot
|
|
from django.db import transaction
|
|
from django.utils import timezone
|
|
from datetime import datetime
|
|
|
|
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
|
|
|
|
snapshots = Snapshot.objects.all()
|
|
|
|
if filter_patterns:
|
|
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
|
|
|
|
if before:
|
|
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
|
|
if after:
|
|
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
|
|
|
|
total = snapshots.count()
|
|
print(f'[*] Found {total} matching snapshots')
|
|
|
|
for snapshot in snapshots.iterator():
|
|
# Reconcile index.json with DB
|
|
snapshot.reconcile_with_index_json()
|
|
|
|
# Queue for archiving
|
|
snapshot.status = Snapshot.StatusChoices.QUEUED
|
|
snapshot.retry_at = timezone.now()
|
|
snapshot.save()
|
|
|
|
stats['reconciled'] += 1
|
|
stats['queued'] += 1
|
|
stats['processed'] += 1
|
|
|
|
if stats['processed'] % batch_size == 0:
|
|
transaction.commit()
|
|
print(f" [{stats['processed']}/{total}] Processed...")
|
|
|
|
transaction.commit()
|
|
return stats
|
|
|
|
|
|
def print_stats(stats: dict):
|
|
"""Print statistics for filtered mode."""
|
|
from rich import print
|
|
|
|
print(f"""
|
|
[green]Update Complete[/green]
|
|
Processed: {stats['processed']}
|
|
Reconciled: {stats['reconciled']}
|
|
Queued: {stats['queued']}
|
|
""")
|
|
|
|
|
|
def print_combined_stats(stats_combined: dict):
|
|
"""Print statistics for full mode."""
|
|
from rich import print
|
|
|
|
s1 = stats_combined['phase1']
|
|
s2 = stats_combined['phase2']
|
|
|
|
print(f"""
|
|
[green]Archive Update Complete[/green]
|
|
|
|
Phase 1 (Import Orphans):
|
|
Checked: {s1.get('processed', 0)}
|
|
Imported: {s1.get('imported', 0)}
|
|
Migrated: {s1.get('migrated', 0)}
|
|
Invalid: {s1.get('invalid', 0)}
|
|
|
|
Phase 2 (Process DB):
|
|
Processed: {s2.get('processed', 0)}
|
|
Reconciled: {s2.get('reconciled', 0)}
|
|
Queued: {s2.get('queued', 0)}
|
|
|
|
Phase 3 (Deduplicate):
|
|
Merged: {stats_combined['deduplicated']}
|
|
""")
|
|
|
|
|
|
@click.command()
|
|
@click.option('--resume', type=str, help='Resume from timestamp')
|
|
@click.option('--before', type=float, help='Only snapshots before timestamp')
|
|
@click.option('--after', type=float, help='Only snapshots after timestamp')
|
|
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
|
|
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
|
|
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
|
|
@click.argument('filter_patterns', nargs=-1)
|
|
@docstring(update.__doc__)
|
|
def main(**kwargs):
|
|
update(**kwargs)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|