Logging and admin UI improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run orchestrator until all work is done
# Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete

View File

@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL
try:
snap = Snapshot.objects.get(url=record['url'])
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
except Snapshot.DoesNotExist:
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:

View File

@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl = Crawl.objects.create(
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
defaults={
'status': 'queued',
}
)
# If crawl already existed, reset it to queued state so it can be processed again
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=CrawlModel.FINAL_STATES
)
print(f'[+] Crawls in queue: {queued_crawls.count()}')
if queued_crawls.exists():
for c in queued_crawls:
print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()

View File

@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
from workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
# If daphne is already running, just tail logs
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()
# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View File

@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath