Logging and admin UI improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run orchestrator until all work is done
# Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete

View File

@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL
try:
snap = Snapshot.objects.get(url=record['url'])
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
except Snapshot.DoesNotExist:
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:

View File

@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl = Crawl.objects.create(
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
defaults={
'status': 'queued',
}
)
# If crawl already existed, reset it to queued state so it can be processed again
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=CrawlModel.FINAL_STATES
)
print(f'[+] Crawls in queue: {queued_crawls.count()}')
if queued_crawls.exists():
for c in queued_crawls:
print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()

View File

@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
from workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
# If daphne is already running, just tail logs
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()
# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View File

@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath