ArchiveBox/archivebox/cli/archivebox_install.py

#!/usr/bin/env python3

__package__ = 'archivebox.cli'

import os
import sys
import shutil

import rich_click as click
from rich import print

from archivebox.misc.util import docstring, enforce_types
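
# NOTE: everything heavier than the CLI plumbing is imported lazily inside
# install() below, so importing this module (e.g. just to render --help) stays
# cheap and the ORM models are only touched after setup_django() has run.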


@enforce_types
def install(dry_run: bool=False) -> None:
    """Detect and install ArchiveBox dependencies by running a dependency-check crawl"""

    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
    from archivebox.config.paths import ARCHIVE_DIR
    from archivebox.misc.logging import stderr
    from archivebox.cli.archivebox_init import init

    if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
        init()  # must init the full index first: we need a db to store Binary entries in

    print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')

    if IS_ROOT:
        EUID = os.geteuid()
        print()
        print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
        print(f'    DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
        print()

    if dry_run:
        print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
        return
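
    # Everything below needs the database, so it only runs for a real (non-dry-run) install.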

    # Set up Django so the ORM is usable
    from archivebox.config.django import setup_django
    setup_django()

    from django.utils import timezone
    from archivebox.crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk

    # Create a minimal crawl for dependency detection that will trigger the on_Crawl hooks
    created_by_id = get_or_create_system_user_pk()
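
    # get_or_create keys on the sentinel URL 'archivebox://install', so repeated
    # runs of `archivebox install` reuse a single Crawl row instead of piling up
    # duplicate dependency-detection crawls.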
    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',
        defaults={
            'label': 'Dependency detection',
            'created_by_id': created_by_id,
            'max_depth': 0,
            'status': 'queued',
        },
    )

    # If the crawl already existed, reset it to queued so it can be processed again
    if not created:
        crawl.status = 'queued'
        crawl.retry_at = timezone.now()
        crawl.save()

    print(f'[+] {"Created" if created else "Re-queued"} dependency detection crawl: {crawl.id}')
    print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')

    # Verify the crawl is actually visible in the queue
    queued_crawls = Crawl.objects.filter(
        retry_at__lte=timezone.now()
    ).exclude(
        status__in=Crawl.FINAL_STATES
    )
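    # (a crawl is eligible for orchestrator pickup once retry_at <= now and its
    # status has not yet reached a final state, which is what this filter expresses)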
    print(f'[+] Crawls in queue: {queued_crawls.count()}')
    for c in queued_crawls:
        print(f'    - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')

    print('[+] Running crawl to detect binaries via on_Crawl hooks...')
    print()

    # Run the crawl synchronously (this triggers the on_Crawl hooks)
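    # exit_on_idle=True makes the runloop return once the queue is drained,
    # rather than staying resident as a long-running worker daemon.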
    from archivebox.workers.orchestrator import Orchestrator
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.runloop()

    print()

    # Check whether a real superuser exists yet
    from django.contrib.auth import get_user_model
    User = get_user_model()
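    # The 'system' user created by get_or_create_system_user_pk() is excluded below:
    # it only exists to own automated records, not to log into the admin UI.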
    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
        stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
        stderr('    archivebox manage createsuperuser')

    print()

    # Run version at the end to show the full dependency status
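    # If archivebox is installed as a console script, call the binary directly;
    # otherwise (source checkout, where the fallback is sys.executable) re-invoke
    # the CLI via `python -m archivebox`.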
    archivebox_path = shutil.which('archivebox') or sys.executable
    if 'python' in archivebox_path:
        os.system(f'{sys.executable} -m archivebox version')
    else:
        os.system(f'{archivebox_path} version')


@click.command()
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:
    install(**kwargs)


if __name__ == '__main__':
    main()