From 866f993f2676b995ab30e5e2bc1503c76559003d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 25 Dec 2025 01:10:41 -0800 Subject: [PATCH] logging and admin ui improvements --- archivebox/ArchiveBox.conf | 3 + archivebox/api/admin.py | 39 +- archivebox/cli/archivebox_add.py | 8 +- archivebox/cli/archivebox_extract.py | 8 +- archivebox/cli/archivebox_install.py | 31 +- archivebox/cli/archivebox_server.py | 45 +- archivebox/cli/archivebox_version.py | 5 +- archivebox/config/django.py | 199 ++-- archivebox/core/admin_archiveresults.py | 178 ++- archivebox/core/admin_snapshots.py | 47 +- archivebox/core/admin_tags.py | 16 +- archivebox/core/apps.py | 40 - .../0025_allow_duplicate_urls_per_crawl.py | 22 + archivebox/core/models.py | 68 +- archivebox/core/statemachines.py | 59 +- archivebox/core/views.py | 63 +- archivebox/crawls/admin.py | 248 +++- archivebox/crawls/models.py | 56 +- archivebox/crawls/statemachines.py | 126 +- archivebox/hooks.py | 8 + archivebox/logs/errors.log | 2 + archivebox/machine/admin.py | 96 +- archivebox/misc/logging_util.py | 24 +- .../on_Snapshot__39_accessibility.js | 9 +- .../on_Snapshot__13_archive_org.py | 2 +- .../on_Snapshot__45_chrome_cleanup.py | 2 +- .../on_Snapshot__30_chrome_navigate.js | 2 +- .../consolelog/on_Snapshot__21_consolelog.js | 9 +- archivebox/plugins/dom/on_Snapshot__36_dom.js | 11 +- .../favicon/on_Snapshot__11_favicon.py | 2 +- archivebox/plugins/git/on_Snapshot__12_git.py | 2 +- .../headers/on_Snapshot__33_headers.js | 9 +- .../htmltotext/on_Snapshot__54_htmltotext.py | 5 +- .../plugins/media/on_Snapshot__51_media.py | 7 +- .../mercury/on_Snapshot__53_mercury.py | 5 +- .../on_Snapshot__40_parse_dom_outlinks.js | 9 +- archivebox/plugins/pdf/on_Snapshot__35_pdf.js | 11 +- .../on_Snapshot__52_readability.py | 5 +- .../redirects/on_Snapshot__22_redirects.js | 9 +- .../responses/on_Snapshot__24_responses.js | 10 +- .../screenshot/on_Snapshot__34_screenshot.js | 11 +- .../on_Crawl__00_validate_ripgrep.py | 131 +++ .../search_backend_ripgrep/tests/__init__.py | 0 .../tests/test_ripgrep_detection.py | 306 +++++ .../on_Snapshot__91_index_sonic.py | 2 +- .../on_Snapshot__90_index_sqlite.py | 2 +- archivebox/plugins/seo/on_Snapshot__38_seo.js | 9 +- .../singlefile/on_Snapshot__04_singlefile.js | 8 +- .../singlefile/on_Snapshot__37_singlefile.py | 11 +- archivebox/plugins/ssl/on_Snapshot__23_ssl.js | 9 +- .../staticfile/on_Snapshot__31_staticfile.py | 7 +- .../plugins/title/on_Snapshot__32_title.js | 9 +- .../plugins/wget/on_Snapshot__50_wget.py | 4 +- archivebox/templates/admin/base.html | 1025 +++++++++++++++++ .../templates/admin/progress_monitor.html | 237 +++- .../management/commands/orchestrator.py | 11 +- archivebox/workers/orchestrator.py | 35 +- archivebox/workers/supervisord_util.py | 85 +- archivebox/workers/tasks.py | 23 +- archivebox/workers/worker.py | 4 +- 60 files changed, 2932 insertions(+), 497 deletions(-) create mode 100644 archivebox/ArchiveBox.conf create mode 100644 archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py create mode 100644 archivebox/logs/errors.log create mode 100755 archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py create mode 100644 archivebox/plugins/search_backend_ripgrep/tests/__init__.py create mode 100644 archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py diff --git a/archivebox/ArchiveBox.conf b/archivebox/ArchiveBox.conf new file mode 100644 index 00000000..fb119776 --- /dev/null +++ b/archivebox/ArchiveBox.conf @@ -0,0 +1,3 @@ 
+[SERVER_CONFIG] +SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4 + diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py index 056f0ead..78545257 100644 --- a/archivebox/api/admin.py +++ b/archivebox/api/admin.py @@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin): sort_fields = ('id', 'created_at', 'created_by', 'expires') readonly_fields = ('created_at', 'modified_at') search_fields = ('id', 'created_by__username', 'token') - fields = ('created_by', 'token', 'expires', *readonly_fields) + + fieldsets = ( + ('Token', { + 'fields': ('token', 'expires'), + 'classes': ('card',), + }), + ('Owner', { + 'fields': ('created_by',), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ) list_filter = ('created_by',) ordering = ['-created_at'] @@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin): sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error') readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields) + fieldsets = ( + ('Webhook', { + 'fields': ('name', 'signal', 'referenced_model', 'endpoint'), + 'classes': ('card', 'wide'), + }), + ('Authentication', { + 'fields': ('auth_token',), + 'classes': ('card',), + }), + ('Status', { + 'fields': ('enabled', 'last_success', 'last_error'), + 'classes': ('card',), + }), + ('Owner', { + 'fields': ('created_by',), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ) + def register_admin(admin_site): admin_site.register(APIToken, APITokenAdmin) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index e9bcc53e..b668d26b 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -115,12 +115,10 @@ def add(urls: str | list[str], # - Repeat until max_depth reached if bg: - # Background mode: start orchestrator and return immediately - print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]') - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.start() # Fork to background + # Background mode: just queue work and return (orchestrator via server will pick it up) + print('[yellow]\\[*] URLs queued. 
Orchestrator will process them (run `archivebox server` if not already running).[/yellow]') else: - # Foreground mode: run orchestrator until all work is done + # Foreground mode: run orchestrator inline until all work is done print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]') orchestrator = Orchestrator(exit_on_idle=True) orchestrator.runloop() # Block until complete diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index c3fa89ef..affea542 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -117,11 +117,11 @@ def run_plugins( if snapshot_id: snapshot_ids.add(snapshot_id) elif record.get('url'): - # Look up by URL - try: - snap = Snapshot.objects.get(url=record['url']) + # Look up by URL (get most recent if multiple exist) + snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first() + if snap: snapshot_ids.add(str(snap.id)) - except Snapshot.DoesNotExist: + else: rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr) elif record_type == TYPE_ARCHIVERESULT: diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index 5674b3d8..b797944d 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None: # Using a minimal crawl that will trigger on_Crawl hooks created_by_id = get_or_create_system_user_pk() - seed = Seed.objects.create( + seed, _created = Seed.objects.get_or_create( uri='archivebox://install', label='Dependency detection', created_by_id=created_by_id, + defaults={ + 'extractor': 'auto', + } ) - crawl = Crawl.objects.create( + crawl, created = Crawl.objects.get_or_create( seed=seed, max_depth=0, created_by_id=created_by_id, - status='queued', + defaults={ + 'status': 'queued', + } ) + # If crawl already existed, reset it to queued state so it can be processed again + if not created: + crawl.status = 'queued' + crawl.retry_at = timezone.now() + crawl.save() + print(f'[+] Created dependency detection crawl: {crawl.id}') + print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}') + + # Verify the crawl is in the queue + from crawls.models import Crawl as CrawlModel + queued_crawls = CrawlModel.objects.filter( + retry_at__lte=timezone.now() + ).exclude( + status__in=CrawlModel.FINAL_STATES + ) + print(f'[+] Crawls in queue: {queued_crawls.count()}') + if queued_crawls.exists(): + for c in queued_crawls: + print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}') + print('[+] Running crawl to detect binaries via on_Crawl hooks...') print() diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index c369e6ce..146e047c 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), except IndexError: pass - print('[green][+] Starting ArchiveBox webserver...[/green]') - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') - print(' > Writing ArchiveBox error log to ./logs/errors.log') - if SHELL_CONFIG.DEBUG: + print('[green][+] Starting ArchiveBox webserver in DEBUG 
mode...[/green]') + print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') + print(' > Writing ArchiveBox error log to ./logs/errors.log') if not reload: runserver_args.append('--noreload') # '--insecure' if nothreading: runserver_args.append('--nothreading') call_command("runserver", *runserver_args) else: - from workers.supervisord_util import start_server_workers + from workers.supervisord_util import ( + get_existing_supervisord_process, + get_worker, + start_server_workers, + tail_multiple_worker_logs, + ) + # Check if supervisord is already running + supervisor = get_existing_supervisord_process() + if supervisor: + daphne_proc = get_worker(supervisor, 'worker_daphne') + + # If daphne is already running, just tail logs + if daphne_proc and daphne_proc.get('statename') == 'RUNNING': + orchestrator_proc = get_worker(supervisor, 'worker_orchestrator') + print('[yellow][!] ArchiveBox server is already running[/yellow]') + print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING': + print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') + print() + print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]') + print() + + # Tail logs for both workers + tail_multiple_worker_logs( + log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'], + follow=True, + ) + return + # Otherwise, daphne is not running - fall through to start it + + # No existing workers found - start new ones + print('[green][+] Starting ArchiveBox webserver...[/green]') + print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') + print(' > Writing ArchiveBox error log to ./logs/errors.log') print() start_server_workers(host=host, port=port, daemonize=daemonize) print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 2b231c9f..c891b8ea 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -119,12 +119,13 @@ def version(quiet: bool=False, else: for key in sorted(set(binary_config_keys)): # Get the actual binary name/path from config value - bin_value = config.get(key, '').strip() + # Prioritize Machine.config overrides over base config + bin_value = machine.config.get(key) or config.get(key, '').strip() if not bin_value: continue # Check if it's a path (has slashes) or just a name - is_path = '/' in bin_value + is_path = '/' in str(bin_value) if is_path: # It's a full path - match against abspath diff --git a/archivebox/config/django.py b/archivebox/config/django.py index 77169ee3..d7910ec0 100644 --- a/archivebox/config/django.py +++ b/archivebox/config/django.py @@ -5,7 +5,6 @@ import sys from datetime import datetime, timezone -from rich.progress import Progress from rich.console import Console import django 
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True) logging.CONSOLE = CONSOLE -INITIAL_STARTUP_PROGRESS = None -INITIAL_STARTUP_PROGRESS_TASK = 0 - -def bump_startup_progress_bar(advance=1): - global INITIAL_STARTUP_PROGRESS - global INITIAL_STARTUP_PROGRESS_TASK - if INITIAL_STARTUP_PROGRESS: - INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore - - def setup_django_minimal(): # sys.path.append(str(CONSTANTS.PACKAGE_DIR)) # os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR)) @@ -49,9 +38,7 @@ DJANGO_SET_UP = False def setup_django(check_db=False, in_memory_db=False) -> None: from rich.panel import Panel - - global INITIAL_STARTUP_PROGRESS - global INITIAL_STARTUP_PROGRESS_TASK + global DJANGO_SET_UP if DJANGO_SET_UP: @@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None: # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes return - with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS: - INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True) - - from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission - - # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user - if IS_ROOT and ARCHIVEBOX_USER != 0: - with SudoPermission(uid=0): - # running as root is a special case where it's ok to be a bit slower - # make sure data dir is always owned by the correct user - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null') - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null') + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission - bump_startup_progress_bar() - try: - from django.core.management import call_command - - bump_startup_progress_bar() + # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user + if IS_ROOT and ARCHIVEBOX_USER != 0: + with SudoPermission(uid=0): + # running as root is a special case where it's ok to be a bit slower + # make sure data dir is always owned by the correct user + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null') + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null') - if in_memory_db: - raise Exception('dont use this anymore') - - # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. - # in those cases we create a temporary in-memory db and run the migrations - # immediately to get a usable in-memory-database at startup - os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + try: + from django.core.management import call_command + + if in_memory_db: + raise Exception('dont use this anymore') + + # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk. 
+ # in those cases we create a temporary in-memory db and run the migrations + # immediately to get a usable in-memory-database at startup + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + django.setup() + + call_command("migrate", interactive=False, verbosity=0) + else: + # Otherwise use default sqlite3 file-based database and initialize django + # without running migrations automatically (user runs them manually by calling init) + try: django.setup() - - bump_startup_progress_bar() - call_command("migrate", interactive=False, verbosity=0) - else: - # Otherwise use default sqlite3 file-based database and initialize django - # without running migrations automatically (user runs them manually by calling init) - try: - django.setup() - except Exception as e: - bump_startup_progress_bar(advance=1000) - - is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version')) - if not is_using_meta_cmd: - # show error message to user only if they're not running a meta command / just trying to get help - STDERR.print() - STDERR.print(Panel( - f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', - title='\n\n[red][X] Error while trying to load database![/red]', - subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', - expand=False, - style='bold red', - )) - STDERR.print() - STDERR.print_exception(show_locals=False) - return - - bump_startup_progress_bar() - - from django.conf import settings - - # log startup message to the error log - with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: - command = ' '.join(sys.argv) - ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') - f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") - - if check_db: - # make sure the data dir is owned by a non-root user - if CONSTANTS.DATA_DIR.stat().st_uid == 0: - STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]') - STDERR.print(f' {CONSTANTS.DATA_DIR}') + except Exception as e: + is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version')) + if not is_using_meta_cmd: + # show error message to user only if they're not running a meta command / just trying to get help STDERR.print() - STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? 
(and as a non-root user?)') - STDERR.print(' cd path/to/your/archive/data') - STDERR.print(' archivebox [command]') + STDERR.print(Panel( + f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', + title='\n\n[red][X] Error while trying to load database![/red]', + subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', + expand=False, + style='bold red', + )) STDERR.print() - raise SystemExit(9) - - # Create cache table in DB if needed - try: - from django.core.cache import cache - cache.get('test', None) - except django.db.utils.OperationalError: - call_command("createcachetable", verbosity=0) + STDERR.print_exception(show_locals=False) + return - bump_startup_progress_bar() + from django.conf import settings - # if archivebox gets imported multiple times, we have to close - # the sqlite3 whenever we init from scratch to avoid multiple threads - # sharing the same connection by accident - from django.db import connections - for conn in connections.all(): - conn.close_if_unusable_or_obsolete() + # log startup message to the error log + with open(settings.ERROR_LOG, "a", encoding='utf-8') as f: + command = ' '.join(sys.argv) + ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S') + f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") - sql_index_path = CONSTANTS.DATABASE_FILE - assert os.access(sql_index_path, os.F_OK), ( - f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)') + if check_db: + # make sure the data dir is owned by a non-root user + if CONSTANTS.DATA_DIR.stat().st_uid == 0: + STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]') + STDERR.print(f' {CONSTANTS.DATA_DIR}') + STDERR.print() + STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? 
(and as a non-root user?)') + STDERR.print(' cd path/to/your/archive/data') + STDERR.print(' archivebox [command]') + STDERR.print() + raise SystemExit(9) - bump_startup_progress_bar() + # Create cache table in DB if needed + try: + from django.core.cache import cache + cache.get('test', None) + except django.db.utils.OperationalError: + call_command("createcachetable", verbosity=0) - # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging - # if settings.DEBUG_LOGFIRE: - # from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor - # SQLite3Instrumentor().instrument() + # if archivebox gets imported multiple times, we have to close + # the sqlite3 whenever we init from scratch to avoid multiple threads + # sharing the same connection by accident + from django.db import connections + for conn in connections.all(): + conn.close_if_unusable_or_obsolete() - # import logfire + sql_index_path = CONSTANTS.DATABASE_FILE + assert os.access(sql_index_path, os.F_OK), ( + f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)') - # logfire.configure() - # logfire.instrument_django(is_sql_commentor_enabled=True) - # logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv) + # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging + # if settings.DEBUG_LOGFIRE: + # from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor + # SQLite3Instrumentor().instrument() + + # import logfire + + # logfire.configure() + # logfire.instrument_django(is_sql_commentor_enabled=True) + # logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv) + + except KeyboardInterrupt: + raise SystemExit(2) - except KeyboardInterrupt: - raise SystemExit(2) - DJANGO_SET_UP = True - - INITIAL_STARTUP_PROGRESS = None - INITIAL_STARTUP_PROGRESS_TASK = None diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 5497d2a6..59864571 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon from core.models import ArchiveResult, Snapshot +def render_archiveresults_list(archiveresults_qs, limit=50): + """Render a nice inline list view of archive results with status, extractor, output, and actions.""" + + results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit]) + + if not results: + return mark_safe('
No Archive Results yet...
') + + # Status colors + status_colors = { + 'succeeded': ('#166534', '#dcfce7'), # green + 'failed': ('#991b1b', '#fee2e2'), # red + 'queued': ('#6b7280', '#f3f4f6'), # gray + 'started': ('#92400e', '#fef3c7'), # amber + } + + rows = [] + for idx, result in enumerate(results): + status = result.status or 'queued' + color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6')) + + # Get extractor icon + icon = get_extractor_icon(result.extractor) + + # Format timestamp + end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-' + + # Truncate output for display + full_output = result.output or '-' + output_display = full_output[:60] + if len(full_output) > 60: + output_display += '...' + + # Get full command as tooltip + cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-') + + # Build output link + output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/' + + # Get version - try cmd_version field + version = result.cmd_version if result.cmd_version else '-' + + # Unique ID for this row's expandable output + row_id = f'output_{idx}_{str(result.id)[:8]}' + + rows.append(f''' + + + {status} + + + {icon} + + + {result.extractor} + + + + {output_display} + + + + {end_time} + + + {version} + + +
+ 📄 + ✏️ +
+ + + + +
+ + Details & Output + +
+
+ ID: {str(result.id)[:8]}... + Version: {version} + PWD: {result.pwd or '-'} +
+
+ Output: +
+
{full_output}
+
+ Command: +
+
{cmd_str}
+
+
+ + + ''') + + total_count = archiveresults_qs.count() + footer = '' + if total_count > limit: + footer = f''' + + + Showing {limit} of {total_count} results   + View all → + + + ''' + + return mark_safe(f''' +
+ + + + + + + + + + + + + + {''.join(rows)} + {footer} + +
Status Extractor Output Completed Version Actions
+
+ ''') + + class ArchiveResultInline(admin.TabularInline): name = 'Archive Results Log' @@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin): sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status') readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon') search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields) autocomplete_fields = ['snapshot'] + fieldsets = ( + ('Snapshot', { + 'fields': ('snapshot', 'snapshot_info', 'tags_str'), + 'classes': ('card', 'wide'), + }), + ('Extractor', { + 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'), + 'classes': ('card',), + }), + ('Timing', { + 'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'), + 'classes': ('card',), + }), + ('Command', { + 'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'), + 'classes': ('card',), + }), + ('Output', { + 'fields': ('output', 'output_summary'), + 'classes': ('card', 'wide'), + }), + ('Metadata', { + 'fields': ('created_by',), + 'classes': ('card',), + }), + ) + list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') ordering = ['-start_ts'] list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE - + paginator = AccelleratedPaginator save_on_top = True - + actions = ['delete_selected'] - + class Meta: verbose_name = 'Archive Result' verbose_name_plural = 'Archive Results' diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index d1917e52..d25f291c 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add from core.models import Tag from core.admin_tags import TagInline -from core.admin_archiveresults import ArchiveResultInline +from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} @@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm): class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl') - readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir') + readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list') search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') - fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields) + + fieldsets = ( + ('URL', { + 'fields': ('url', 'title'), + 'classes': ('card', 'wide'), + }), + ('Status', { + 'fields': ('status', 'retry_at', 'status_info'), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'), + 'classes': ('card',), + }), + ('Relations', { + 'fields': ('crawl', 'created_by', 'tags_str'), + 'classes': ('card',), + }), + ('Config', { + 'fields': ('config',), + 'classes': ('card',), + 
}), + ('Files', { + 'fields': ('output_dir',), + 'classes': ('card',), + }), + ('Actions', { + 'fields': ('admin_actions',), + 'classes': ('card', 'wide'), + }), + ('Archive Results', { + 'fields': ('archiveresults_list',), + 'classes': ('card', 'wide'), + }), + ) + ordering = ['-created_at'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - inlines = [TagInline, ArchiveResultInline] + inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) action_form = SnapshotActionForm @@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): obj.extension or '-', ) + @admin.display(description='Archive Results') + def archiveresults_list(self, obj): + return render_archiveresults_list(obj.archiveresult_set.all()) + @admin.display( description='Title', ordering='title', diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py index b0f09b9b..f2d0a8cf 100644 --- a/archivebox/core/admin_tags.py +++ b/archivebox/core/admin_tags.py @@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin): sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at') readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots') search_fields = ('id', 'name', 'slug') - fields = ('name', 'created_by', *readonly_fields) actions = ['delete_selected', 'merge_tags'] ordering = ['-created_at'] # inlines = [TaggedItemInline] + fieldsets = ( + ('Tag Info', { + 'fields': ('name', 'slug'), + 'classes': ('card',), + }), + ('Metadata', { + 'fields': ('id', 'created_by', 'created_at', 'modified_at'), + 'classes': ('card',), + }), + ('Snapshots', { + 'fields': ('snapshots',), + 'classes': ('card', 'wide'), + }), + ) + paginator = AccelleratedPaginator diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index 5193166d..4581f208 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -1,7 +1,5 @@ __package__ = 'archivebox.core' -import sys - from django.apps import AppConfig @@ -12,41 +10,3 @@ class CoreConfig(AppConfig): """Register the archivebox.core.admin_site as the main django admin site""" from core.admin_site import register_admin_site register_admin_site() - - # Auto-start the orchestrator when running the web server - self._maybe_start_orchestrator() - - def _maybe_start_orchestrator(self): - """Start the orchestrator if we're running a web server.""" - import os - - # Don't start orchestrator during migrations, shell, tests, etc. 
- # Only start when running: runserver, daphne, gunicorn, uwsgi - if not self._is_web_server(): - return - - # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false - if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'): - return - - # Don't start in autoreload child process (avoid double-start) - if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv: - return - - try: - from workers.orchestrator import Orchestrator - - if not Orchestrator.is_running(): - # Start orchestrator as daemon (won't exit on idle when started by server) - orchestrator = Orchestrator(exit_on_idle=False) - orchestrator.start() - except Exception as e: - # Don't crash the server if orchestrator fails to start - import logging - logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}') - - def _is_web_server(self) -> bool: - """Check if we're running a web server command.""" - # Check for common web server indicators - server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server') - return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands) diff --git a/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py b/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py new file mode 100644 index 00000000..0c2d80d6 --- /dev/null +++ b/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py @@ -0,0 +1,22 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_snapshot_crawl'), + ] + + operations = [ + # Remove the unique constraint on url + migrations.AlterField( + model_name='snapshot', + name='url', + field=models.URLField(db_index=True, unique=False), + ), + # Add unique constraint on (url, crawl) combination + migrations.AddConstraint( + model_name='snapshot', + constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 543435aa..57369460 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -60,7 +60,8 @@ class Tag(ModelWithSerializers): return self.name def save(self, *args, **kwargs): - if self._state.adding: + is_new = self._state.adding + if is_new: self.slug = slugify(self.name) existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True)) i = None @@ -72,6 +73,19 @@ class Tag(ModelWithSerializers): i = (i or 0) + 1 super().save(*args, **kwargs) + if is_new: + from archivebox.misc.logging_util import log_worker_event + log_worker_event( + worker_type='DB', + event='Created Tag', + indent_level=0, + metadata={ + 'id': self.id, + 'name': self.name, + 'slug': self.slug, + }, + ) + @property def api_url(self) -> str: return reverse_lazy('api-1:get_tag', args=[self.id]) @@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): if tag.strip() )) - try: - snapshot = self.get(url=url) + # Get most recent snapshot with this URL (URLs can exist in multiple crawls) + snapshot = self.filter(url=url).order_by('-created_at').first() + if snapshot: if title and (not snapshot.title or len(title) > len(snapshot.title or '')): snapshot.title = title snapshot.save(update_fields=['title', 'modified_at']) - except self.model.DoesNotExist: + else: if timestamp: while self.filter(timestamp=timestamp).exists(): timestamp = str(float(timestamp) + 1.0) @@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, 
ModelWithHea created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) - url = models.URLField(unique=True, db_index=True) + url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True) crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore @@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea class Meta(TypedModelMeta): verbose_name = "Snapshot" verbose_name_plural = "Snapshots" + constraints = [ + # Allow same URL in different crawls, but not duplicates within same crawl + models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), + ] def __str__(self): return f'[{self.id}] {self.url[:64]}' def save(self, *args, **kwargs): + is_new = self._state.adding if not self.bookmarked_at: self.bookmarked_at = self.created_at or timezone.now() if not self.timestamp: @@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea self.crawl.urls += f'\n{self.url}' self.crawl.save() + if is_new: + from archivebox.misc.logging_util import log_worker_event + log_worker_event( + worker_type='DB', + event='Created Snapshot', + indent_level=2, + url=self.url, + metadata={ + 'id': str(self.id), + 'crawl_id': str(self.crawl_id) if self.crawl_id else None, + 'depth': self.depth, + 'status': self.status, + }, + ) + def output_dir_parent(self) -> str: return 'archive' @@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def __str__(self): return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}' + def save(self, *args, **kwargs): + is_new = self._state.adding + super().save(*args, **kwargs) + if is_new: + from archivebox.misc.logging_util import log_worker_event + log_worker_event( + worker_type='DB', + event='Created ArchiveResult', + indent_level=3, + extractor=self.extractor, + metadata={ + 'id': str(self.id), + 'snapshot_id': str(self.snapshot_id), + 'snapshot_url': str(self.snapshot.url)[:64], + 'status': self.status, + }, + ) + @cached_property def snapshot_dir(self): return Path(self.snapshot.output_dir) @@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi from django.utils import timezone from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook - extractor_dir = Path(self.snapshot.output_dir) / self.extractor config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] # Find hook for this extractor @@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.save() return + # Use plugin directory name instead of extractor name (removes numeric prefix) + plugin_name = hook.parent.name + extractor_dir = Path(self.snapshot.output_dir) / plugin_name + # Run the hook start_ts = timezone.now() result = run_hook( diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index 26a0ed7f..fde35403 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True): super().__init__(snapshot, *args, **kwargs) def __repr__(self) -> str: 
- return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]' - + return f'Snapshot[{self.snapshot.id}]' + def __str__(self) -> str: return self.__repr__() - + def can_start(self) -> bool: can_start = bool(self.snapshot.url) - if not can_start: - print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s') + # Suppressed: queue waiting logs return can_start def is_finished(self) -> bool: @@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True): @queued.enter def enter_queued(self): - print(f'{self}.on_queued() ↳ snapshot.retry_at = now()') + # Suppressed: state transition logs self.snapshot.update_for_workers( retry_at=timezone.now(), status=Snapshot.StatusChoices.QUEUED, ) - + @started.enter def enter_started(self): - print(f'{self}.on_started() ↳ snapshot.run()') + # Suppressed: state transition logs # lock the snapshot while we create the pending archiveresults self.snapshot.update_for_workers( retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying @@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True): retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again status=Snapshot.StatusChoices.STARTED, ) - + @sealed.enter def enter_sealed(self): - print(f'{self}.on_sealed() ↳ snapshot.retry_at=None') + # Suppressed: state transition logs self.snapshot.update_for_workers( retry_at=None, status=Snapshot.StatusChoices.SEALED, @@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True): super().__init__(archiveresult, *args, **kwargs) def __repr__(self) -> str: - return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]' - + return f'ArchiveResult[{self.archiveresult.id}]' + def __str__(self) -> str: return self.__repr__() - + def can_start(self) -> bool: can_start = bool(self.archiveresult.snapshot.url) - if not can_start: - print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s') + # Suppressed: queue waiting logs return can_start def is_succeeded(self) -> bool: @@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True): @queued.enter def enter_queued(self): - print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()') + # Suppressed: state transition logs self.archiveresult.update_for_workers( retry_at=timezone.now(), status=ArchiveResult.StatusChoices.QUEUED, start_ts=None, ) # bump the snapshot's retry_at so they pickup any new changes - + @started.enter def enter_started(self): - print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()') - + # Suppressed: state transition logs # Lock the object and mark start time self.archiveresult.update_for_workers( retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor status=ArchiveResult.StatusChoices.STARTED, start_ts=timezone.now(), ) - + # Run the extractor - this updates status, output, timestamps, etc. 
self.archiveresult.run() - + # Save the updated result self.archiveresult.save() - - # Log the result - if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: - print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...') - elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED: - print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...') - elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED: - print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}') + + # Suppressed: extractor result logs (already logged by worker) @backoff.enter def enter_backoff(self): - print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None') + # Suppressed: state transition logs self.archiveresult.update_for_workers( retry_at=timezone.now() + timedelta(seconds=60), status=ArchiveResult.StatusChoices.BACKOFF, @@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True): # retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1, ) self.archiveresult.save(write_indexes=True) - + @succeeded.enter def enter_succeeded(self): - print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()') + # Suppressed: state transition logs self.archiveresult.update_for_workers( retry_at=None, status=ArchiveResult.StatusChoices.SUCCEEDED, @@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True): @failed.enter def enter_failed(self): - print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()') + # Suppressed: state transition logs self.archiveresult.update_for_workers( retry_at=None, status=ArchiveResult.StatusChoices.FAILED, @@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True): @skipped.enter def enter_skipped(self): - print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()') + # Suppressed: state transition logs self.archiveresult.update_for_workers( retry_at=None, status=ArchiveResult.StatusChoices.SKIPPED, diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 43110364..3f9b1794 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView): mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"), ) - # Start orchestrator in background to process the queued crawl - try: - from archivebox.workers.tasks import ensure_orchestrator_running - ensure_orchestrator_running() - except Exception as e: - # Orchestrator may already be running via supervisord, or fail to start - # This is not fatal - the crawl will be processed when orchestrator runs - print(f'[!] 
Failed to start orchestrator: {e}') - + # Orchestrator (managed by supervisord) will pick up the queued crawl return redirect(crawl.admin_change_url) @@ -539,6 +531,7 @@ def live_progress_view(request): from workers.orchestrator import Orchestrator from crawls.models import Crawl from core.models import Snapshot, ArchiveResult + from django.db.models import Case, When, Value, IntegerField # Get orchestrator status orchestrator_running = Orchestrator.is_running() @@ -570,8 +563,26 @@ def live_progress_view(request): crawl_snapshots = Snapshot.objects.filter(crawl=crawl) total_snapshots = crawl_snapshots.count() completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count() + started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count() pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count() + # Count URLs in the crawl (for when snapshots haven't been created yet) + urls_count = 0 + if crawl.urls: + urls_count = len([u for u in crawl.urls.split('\n') if u.strip()]) + elif crawl.seed and crawl.seed.uri: + # Try to get URL count from seed + if crawl.seed.uri.startswith('file:///'): + try: + from pathlib import Path + seed_file = Path(crawl.seed.uri.replace('file://', '')) + if seed_file.exists(): + urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')]) + except: + pass + else: + urls_count = 1 # Single URL seed + # Calculate crawl progress crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 @@ -590,16 +601,24 @@ def live_progress_view(request): # Calculate snapshot progress snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0 - # Get active extractors for this snapshot - active_extractors = [ + # Get all extractors for this snapshot + # Order: started first, then queued, then completed + all_extractors = [ { 'id': str(ar.id), 'extractor': ar.extractor, 'status': ar.status, - 'started': ar.start_ts.isoformat() if ar.start_ts else None, - 'progress': 50, } - for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5] + for ar in snapshot_results.annotate( + status_order=Case( + When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)), + When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)), + When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)), + When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)), + default=Value(4), + output_field=IntegerField(), + ) + ).order_by('status_order', 'extractor') ] active_snapshots_for_crawl.append({ @@ -612,9 +631,17 @@ def live_progress_view(request): 'completed_extractors': completed_extractors, 'failed_extractors': failed_extractors, 'pending_extractors': pending_extractors, - 'active_extractors': active_extractors, + 'all_extractors': all_extractors, }) + # Check if crawl can start (for debugging stuck crawls) + can_start = bool(crawl.seed and crawl.seed.uri) + seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None + + # Check if retry_at is in the future (would prevent worker from claiming) + retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False + seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0 + active_crawls.append({ 'id': str(crawl.id), 'label': str(crawl)[:60], @@ -622,11 +649,17 @@ def 
live_progress_view(request): 'started': crawl.modified_at.isoformat() if crawl.modified_at else None, 'progress': crawl_progress, 'max_depth': crawl.max_depth, + 'urls_count': urls_count, 'total_snapshots': total_snapshots, 'completed_snapshots': completed_snapshots, + 'started_snapshots': started_snapshots, 'failed_snapshots': 0, 'pending_snapshots': pending_snapshots, 'active_snapshots': active_snapshots_for_crawl, + 'can_start': can_start, + 'seed_uri': seed_uri, + 'retry_at_future': retry_at_future, + 'seconds_until_retry': seconds_until_retry, }) return JsonResponse({ diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 611a80bc..e5e7f2eb 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -8,6 +8,7 @@ from django.contrib import admin, messages from django.urls import path from django.http import JsonResponse from django.views.decorators.http import require_POST +from django.db.models import Count, Q from archivebox import DATA_DIR @@ -19,13 +20,155 @@ from core.models import Snapshot from crawls.models import Seed, Crawl, CrawlSchedule +def render_snapshots_list(snapshots_qs, limit=20): + """Render a nice inline list view of snapshots with status, title, URL, and progress.""" + + snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate( + total_results=Count('archiveresult'), + succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')), + failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')), + ) + + if not snapshots: + return mark_safe('
No Snapshots yet...
') + + # Status colors matching Django admin and progress monitor + status_colors = { + 'queued': ('#6c757d', '#f8f9fa'), # gray + 'started': ('#856404', '#fff3cd'), # amber + 'sealed': ('#155724', '#d4edda'), # green + 'failed': ('#721c24', '#f8d7da'), # red + } + + rows = [] + for snapshot in snapshots: + status = snapshot.status or 'queued' + color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa')) + + # Calculate progress + total = snapshot.total_results + done = snapshot.succeeded_results + snapshot.failed_results + progress_pct = int((done / total) * 100) if total > 0 else 0 + progress_text = f'{done}/{total}' if total > 0 else '-' + + # Truncate title and URL + title = (snapshot.title or 'Untitled')[:60] + if len(snapshot.title or '') > 60: + title += '...' + url_display = snapshot.url[:50] + if len(snapshot.url) > 50: + url_display += '...' + + # Format date + date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-' + + rows.append(f''' + + + {status} + + + + + + + + {title} + + + {url_display} + + +
+
+
+
+ {progress_text} +
+ + + {date_str} + + + ''') + + total_count = snapshots_qs.count() + footer = '' + if total_count > limit: + footer = f''' + + + Showing {limit} of {total_count} snapshots + + + ''' + + return mark_safe(f''' +
+ + + + + + + + + + + + + {''.join(rows)} + {footer} + +
Status Title URL Progress Created
+
+ ''') + + class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots') sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str') search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str') readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents') - fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields) + + fieldsets = ( + ('Source', { + 'fields': ('uri', 'contents'), + 'classes': ('card', 'wide'), + }), + ('Info', { + 'fields': ('label', 'notes', 'tags_str'), + 'classes': ('card',), + }), + ('Settings', { + 'fields': ('extractor', 'config'), + 'classes': ('card',), + }), + ('Metadata', { + 'fields': ('created_by', 'created_at', 'modified_at'), + 'classes': ('card',), + }), + ('Crawls', { + 'fields': ('scheduled_crawls', 'crawls'), + 'classes': ('card',), + }), + ('Snapshots', { + 'fields': ('snapshots',), + 'classes': ('card',), + }), + ) list_filter = ('extractor', 'created_by') ordering = ['-created_at'] @@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): )) or mark_safe('No Crawls yet...') def snapshots(self, obj): - return format_html_join('
', ' - {}', ( - (snapshot.admin_change_url, snapshot) - for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] - )) or mark_safe('No Snapshots yet...') + return render_snapshots_list(obj.snapshot_set.all()) def contents(self, obj): - if obj.uri.startswith('file:///data/'): - source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1) + source_file = obj.get_file_path() + if source_file: contents = "" try: contents = source_file.read_text().strip()[:14_000] except Exception as e: contents = f'Error reading {source_file}: {e}' - + return format_html('{}:
{}
', source_file, contents) - + return format_html('See URLs here: {}', obj.uri, obj.uri) @@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri') readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor') - fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots') + + fieldsets = ( + ('URLs', { + 'fields': ('seed_urls_editor',), + 'classes': ('card', 'wide'), + }), + ('Info', { + 'fields': ('label', 'notes'), + 'classes': ('card',), + }), + ('Settings', { + 'fields': ('max_depth', 'config'), + 'classes': ('card',), + }), + ('Status', { + 'fields': ('status', 'retry_at'), + 'classes': ('card',), + }), + ('Relations', { + 'fields': ('seed', 'schedule', 'created_by'), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ('Snapshots', { + 'fields': ('snapshots',), + 'classes': ('card', 'wide'), + }), + ) list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at') ordering = ['-created_at', '-retry_at'] @@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): def recrawl(self, request, obj): """Duplicate this crawl as a new crawl with the same seed and settings.""" from django.utils import timezone + from django.shortcuts import redirect + + # Validate seed has a URI (required for crawl to start) + if not obj.seed: + messages.error(request, 'Cannot recrawl: original crawl has no seed.') + return redirect('admin:crawls_crawl_change', obj.id) + + if not obj.seed.uri: + messages.error(request, 'Cannot recrawl: seed has no URI.') + return redirect('admin:crawls_crawl_change', obj.id) new_crawl = Crawl.objects.create( seed=obj.seed, @@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): f'It will start processing shortly.' ) - # Redirect to the new crawl's change page - from django.shortcuts import redirect return redirect('admin:crawls_crawl_change', new_crawl.id) def get_urls(self): @@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): except Crawl.DoesNotExist: return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404) - if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')): + source_file = crawl.seed.get_file_path() if crawl.seed else None + if not source_file: return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400) try: @@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): except json.JSONDecodeError: return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400) - source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1) - try: # Ensure parent directory exists source_file.parent.mkdir(parents=True, exist_ok=True) @@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return obj.snapshot_set.count() def snapshots(self, obj): - return format_html_join('
', '{}', ( - (snapshot.admin_change_url, snapshot) - for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] - )) or mark_safe('No Snapshots yet...') + return render_snapshots_list(obj.snapshot_set.all()) @admin.display(description='Schedule', ordering='schedule') def schedule_str(self, obj): @@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): seed_uri = obj.urls # Check if it's a local file we can edit - is_file = seed_uri.startswith('file:///data/') + source_file = obj.seed.get_file_path() if obj.seed else None + is_file = source_file is not None contents = "" error = None - source_file = None - if is_file: - source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1) + if is_file and source_file: try: contents = source_file.read_text().strip() except Exception as e: @@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin): search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri') readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots') - fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields) + + fieldsets = ( + ('Schedule Info', { + 'fields': ('label', 'notes'), + 'classes': ('card',), + }), + ('Configuration', { + 'fields': ('schedule', 'template'), + 'classes': ('card',), + }), + ('Metadata', { + 'fields': ('created_by', 'created_at', 'modified_at'), + 'classes': ('card',), + }), + ('Crawls', { + 'fields': ('crawls',), + 'classes': ('card', 'wide'), + }), + ('Snapshots', { + 'fields': ('snapshots',), + 'classes': ('card', 'wide'), + }), + ) list_filter = ('created_by',) ordering = ['-created_at'] @@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin): def snapshots(self, obj): crawl_ids = obj.crawl_set.values_list('pk', flat=True) - return format_html_join('
', ' - {}', ( - (snapshot.admin_change_url, snapshot) - for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20] - )) or mark_safe('No Snapshots yet...') + return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids)) def register_admin(admin_site): diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index fadd693d..4bd00328 100644 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS def __str__(self): return f'[{self.id}] {self.uri[:64]}' + def save(self, *args, **kwargs): + is_new = self._state.adding + super().save(*args, **kwargs) + if is_new: + from archivebox.misc.logging_util import log_worker_event + log_worker_event( + worker_type='DB', + event='Created Seed', + indent_level=0, + metadata={ + 'id': str(self.id), + 'uri': str(self.uri)[:64], + 'extractor': self.extractor, + 'label': self.label or None, + }, + ) + @classmethod def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None): - source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data') + # Use absolute path for file:// URLs so extractors can find the files + source_path = str(source_file.resolve()) seed, _ = cls.objects.get_or_create( label=label or source_file.name, uri=f'file://{source_path}', created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(), @@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS def api_url(self) -> str: return reverse_lazy('api-1:get_seed', args=[self.id]) + def get_file_path(self) -> Path | None: + """ + Get the filesystem path for file:// URIs. + Handles both old format (file:///data/...) and new format (file:///absolute/path). + Returns None if URI is not a file:// URI. + """ + if not self.uri.startswith('file://'): + return None + + # Remove file:// prefix + path_str = self.uri.replace('file://', '', 1) + + # Handle old format: file:///data/... -> DATA_DIR/... 
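+        #   (e.g. 'file:///data/sources/example.txt' resolves to CONSTANTS.DATA_DIR / 'sources/example.txt')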
+ if path_str.startswith('/data/'): + return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1) + + # Handle new format: file:///absolute/path + return Path(path_str) + @property def snapshot_set(self) -> QuerySet['Snapshot']: from core.models import Snapshot @@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def __str__(self): return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}' + def save(self, *args, **kwargs): + is_new = self._state.adding + super().save(*args, **kwargs) + if is_new: + from archivebox.misc.logging_util import log_worker_event + log_worker_event( + worker_type='DB', + event='Created Crawl', + indent_level=1, + metadata={ + 'id': str(self.id), + 'seed_uri': str(self.seed.uri)[:64] if self.seed else None, + 'max_depth': self.max_depth, + 'status': self.status, + }, + ) + @classmethod def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None): crawl, _ = cls.objects.get_or_create( diff --git a/archivebox/crawls/statemachines.py b/archivebox/crawls/statemachines.py index 6b68e15b..a71cd010 100644 --- a/archivebox/crawls/statemachines.py +++ b/archivebox/crawls/statemachines.py @@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True): super().__init__(crawl, *args, **kwargs) def __repr__(self) -> str: - return f'[grey53]Crawl\\[{self.crawl.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]' - + return f'Crawl[{self.crawl.id}]' + def __str__(self) -> str: return self.__repr__() def can_start(self) -> bool: - return bool(self.crawl.seed and self.crawl.seed.uri) + if not self.crawl.seed: + print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]') + return False + if not self.crawl.seed.uri: + print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]') + return False + return True def is_finished(self) -> bool: from core.models import Snapshot, ArchiveResult @@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True): @started.enter def enter_started(self): - print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()') + # Suppressed: state transition logs # lock the crawl object while we create snapshots self.crawl.update_for_workers( retry_at=timezone.now() + timedelta(seconds=5), status=Crawl.StatusChoices.QUEUED, ) - # Run the crawl - creates root snapshot and processes queued URLs - self.crawl.run() + try: + # Run on_Crawl hooks to validate/install dependencies + self._run_crawl_hooks() - # only update status to STARTED once snapshots are created - self.crawl.update_for_workers( - retry_at=timezone.now() + timedelta(seconds=5), - status=Crawl.StatusChoices.STARTED, + # Run the crawl - creates root snapshot and processes queued URLs + self.crawl.run() + + # only update status to STARTED once snapshots are created + self.crawl.update_for_workers( + retry_at=timezone.now() + timedelta(seconds=5), + status=Crawl.StatusChoices.STARTED, + ) + except Exception as e: + print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]') + import traceback + traceback.print_exc() + # Re-raise so the worker knows it failed + raise + + def _run_crawl_hooks(self): + """Run on_Crawl hooks to validate/install dependencies.""" + from pathlib import Path + from archivebox.hooks import run_hooks, discover_hooks + from archivebox.config import CONSTANTS + + # Discover and run all on_Crawl hooks + hooks = discover_hooks('Crawl') + 
if not hooks: + return + + # Create a temporary output directory for hook results + output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}' + output_dir.mkdir(parents=True, exist_ok=True) + + # Run all on_Crawl hooks + results = run_hooks( + event_name='Crawl', + output_dir=output_dir, + timeout=60, + config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl], + crawl_id=str(self.crawl.id), + seed_uri=self.crawl.seed.uri if self.crawl.seed else '', ) - @sealed.enter + # Process hook results - parse JSONL output and create DB objects + self._process_hook_results(results) + + def _process_hook_results(self, results: list): + """Process JSONL output from hooks to create InstalledBinary and update Machine config.""" + import json + from machine.models import Machine, InstalledBinary + + machine = Machine.current() + + for result in results: + if result['returncode'] != 0: + # Hook failed - might indicate missing dependency + continue + + # Parse JSONL output + for line in result['stdout'].strip().split('\n'): + if not line.strip(): + continue + + try: + obj = json.loads(line) + obj_type = obj.get('type') + + if obj_type == 'InstalledBinary': + # Create or update InstalledBinary record + # Skip if essential fields are missing + if not obj.get('name') or not obj.get('abspath') or not obj.get('version'): + continue + + InstalledBinary.objects.update_or_create( + machine=machine, + name=obj['name'], + defaults={ + 'abspath': obj['abspath'], + 'version': obj['version'], + 'sha256': obj.get('sha256') or '', + 'binprovider': obj.get('binprovider') or 'env', + } + ) + + elif obj_type == 'Machine': + # Update Machine config + method = obj.get('_method', 'update') + if method == 'update': + key = obj.get('key', '') + value = obj.get('value') + if key.startswith('config/'): + config_key = key[7:] # Remove 'config/' prefix + machine.config[config_key] = value + machine.save(update_fields=['config']) + + elif obj_type == 'Dependency': + # Dependency request - could trigger installation + # For now just log it (installation hooks would be separate) + print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]') + + except json.JSONDecodeError: + # Not JSON, skip + continue + + @sealed.enter def enter_sealed(self): - print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None') + # Suppressed: state transition logs self.crawl.update_for_workers( retry_at=None, status=Crawl.StatusChoices.SEALED, diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 4c2bdd09..4b06324a 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -245,6 +245,14 @@ def run_hook( env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', ''))) env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', ''))) + # Pass SEARCH_BACKEND_ENGINE from new-style config + try: + from archivebox.config.configset import get_config + search_config = get_config() + env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep'))) + except Exception: + env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep') + # Create output directory if needed output_dir.mkdir(parents=True, exist_ok=True) diff --git a/archivebox/logs/errors.log b/archivebox/logs/errors.log new file mode 100644 index 00000000..715cf9d3 --- /dev/null +++ b/archivebox/logs/errors.log @@ -0,0 +1,2 @@ + +> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False diff --git 
a/archivebox/machine/admin.py b/archivebox/machine/admin.py index b1796025..adb6dd19 100644 --- a/archivebox/machine/admin.py +++ b/archivebox/machine/admin.py @@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') readonly_fields = ('guid', 'created_at', 'modified_at', 'ips') - fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed') + + fieldsets = ( + ('Identity', { + 'fields': ('hostname', 'guid', 'ips'), + 'classes': ('card',), + }), + ('Hardware', { + 'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'), + 'classes': ('card',), + }), + ('Operating System', { + 'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'), + 'classes': ('card',), + }), + ('Statistics', { + 'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'), + 'classes': ('card',), + }), + ('Configuration', { + 'fields': ('config',), + 'classes': ('card', 'wide'), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ) list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') ordering = ['-created_at'] @@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin): search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country') readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server') - fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed') + + fieldsets = ( + ('Machine', { + 'fields': ('machine',), + 'classes': ('card',), + }), + ('Network', { + 'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'), + 'classes': ('card',), + }), + ('Location', { + 'fields': ('hostname', 'isp', 'city', 'region', 'country'), + 'classes': ('card',), + }), + ('Usage', { + 'fields': ('num_uses_succeeded', 'num_uses_failed'), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ) list_filter = ('isp', 'country', 'region') ordering = ['-created_at'] @@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin): search_fields = ('id', 'bin_name', 'bin_providers') readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count') - fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields) + + fieldsets = ( + ('Binary', { + 'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'), + 'classes': ('card',), + }), + ('Commands', { + 'fields': ('custom_cmds',), + 'classes': ('card',), + }), + ('Configuration', { + 'fields': ('config',), + 'classes': ('card', 'wide'), + }), + ('Timestamps', { + 'fields': ('id', 'created_at', 'modified_at'), + 'classes': ('card',), + }), + ) list_filter = ('bin_providers', 'created_at') ordering = ['-created_at'] @@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin): search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name') readonly_fields = ('created_at', 'modified_at') - fields = 
('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') + + fieldsets = ( + ('Binary Info', { + 'fields': ('name', 'dependency', 'binprovider'), + 'classes': ('card',), + }), + ('Location', { + 'fields': ('machine', 'abspath'), + 'classes': ('card',), + }), + ('Version', { + 'fields': ('version', 'sha256'), + 'classes': ('card',), + }), + ('Usage', { + 'fields': ('num_uses_succeeded', 'num_uses_failed'), + 'classes': ('card',), + }), + ('Timestamps', { + 'fields': ('created_at', 'modified_at'), + 'classes': ('card',), + }), + ) list_filter = ('name', 'binprovider', 'machine_id', 'dependency') ordering = ['-created_at'] diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 469b705b..766eed98 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -544,16 +544,21 @@ def log_worker_event( # Build worker identifier worker_parts = [worker_type] - if pid: + # Don't add pid/worker_id for DB operations (they happen in whatever process is running) + if pid and worker_type != 'DB': worker_parts.append(f'pid={pid}') - if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'): + if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB': worker_parts.append(f'id={worker_id}') - if url and worker_type == 'SnapshotWorker': + if url and worker_type in ('SnapshotWorker', 'DB'): worker_parts.append(f'url={truncate_url(url)}') - if extractor and worker_type == 'ArchiveResultWorker': + if extractor and worker_type in ('ArchiveResultWorker', 'DB'): worker_parts.append(f'extractor={extractor}') - worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]' + # Format worker label - only add brackets if there are additional identifiers + if len(worker_parts) > 1: + worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]' + else: + worker_label = worker_parts[0] # Build metadata string metadata_str = '' @@ -579,12 +584,14 @@ def log_worker_event( meta_parts.append(f'{k}: {len(v)}') else: meta_parts.append(f'{k}: {v}') - metadata_str = ' {' + ', '.join(meta_parts) + '}' + metadata_str = ' | '.join(meta_parts) # Determine color based on event color = 'white' if event in ('Starting...', 'Started', 'STARTED', 'Started in background'): color = 'green' + elif event.startswith('Created'): + color = 'cyan' # DB creation events elif event in ('Processing...', 'PROCESSING'): color = 'blue' elif event in ('Completed', 'COMPLETED', 'All work complete'): @@ -606,8 +613,9 @@ def log_worker_event( text.append(indent) # Indentation # Append worker label and event with color text.append(f'{worker_label} {event}{error_str}', style=color) - # Append metadata without color - text.append(metadata_str) + # Append metadata without color (add separator if metadata exists) + if metadata_str: + text.append(f' | {metadata_str}') CONSOLE.print(text) diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js index 88f90fb4..c509be9a 100755 --- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js +++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js @@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'accessibility'; -const OUTPUT_DIR = 'accessibility'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'accessibility.json'; -const CHROME_SESSION_DIR = 'chrome_session'; +const 
CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -60,10 +60,7 @@ function getCdpUrl() { // Extract accessibility info async function extractAccessibility(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py index f9eca9bf..1fbd0a6b 100644 --- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py +++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py @@ -24,7 +24,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'archive_org' -OUTPUT_DIR = 'archive_org' +OUTPUT_DIR = '.' OUTPUT_FILE = 'archive.org.txt' diff --git a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py index 2baedcad..fae91ffb 100644 --- a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py +++ b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py @@ -26,7 +26,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'chrome_cleanup' -CHROME_SESSION_DIR = 'chrome_session' +CHROME_SESSION_DIR = '../chrome_session' def get_env(name: str, default: str = '') -> str: diff --git a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js index c883a74f..b34c8c96 100644 --- a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js @@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'chrome_navigate'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js index ba72f2a2..fc90aa03 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js @@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'consolelog'; -const OUTPUT_DIR = 'consolelog'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -86,10 +86,7 @@ async function serializeArgs(args) { async function captureConsoleLogs(url) { const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000; - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Clear existing file diff --git a/archivebox/plugins/dom/on_Snapshot__36_dom.js b/archivebox/plugins/dom/on_Snapshot__36_dom.js index b3b65614..6020ed55 100644 --- a/archivebox/plugins/dom/on_Snapshot__36_dom.js +++ b/archivebox/plugins/dom/on_Snapshot__36_dom.js @@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core'); 
// Extractor metadata const EXTRACTOR_NAME = 'dom'; -const OUTPUT_DIR = 'dom'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'output.html'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) { } // Check if staticfile extractor already downloaded this URL -const STATICFILE_DIR = 'staticfile'; +const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; } @@ -114,10 +114,7 @@ async function dumpDom(url) { const { width, height } = parseResolution(resolution); - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py index 61280af2..78c9e4b3 100644 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py +++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py @@ -26,7 +26,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'favicon' -OUTPUT_DIR = 'favicon' +OUTPUT_DIR = '.' OUTPUT_FILE = 'favicon.ico' diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__12_git.py index 4d2db822..16e0c43e 100644 --- a/archivebox/plugins/git/on_Snapshot__12_git.py +++ b/archivebox/plugins/git/on_Snapshot__12_git.py @@ -26,7 +26,7 @@ import rich_click as click EXTRACTOR_NAME = 'git' BIN_NAME = 'git' BIN_PROVIDERS = 'apt,brew,env' -OUTPUT_DIR = 'repo' +OUTPUT_DIR = '.' def get_env(name: str, default: str = '') -> str: diff --git a/archivebox/plugins/headers/on_Snapshot__33_headers.js b/archivebox/plugins/headers/on_Snapshot__33_headers.js index 79ba3eed..5ead49f5 100644 --- a/archivebox/plugins/headers/on_Snapshot__33_headers.js +++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js @@ -22,9 +22,9 @@ const http = require('http'); // Extractor metadata const EXTRACTOR_NAME = 'headers'; -const OUTPUT_DIR = 'headers'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'headers.json'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; const CHROME_HEADERS_FILE = 'response_headers.json'; // Parse command line arguments @@ -110,10 +110,7 @@ function fetchHeaders(url) { } async function extractHeaders(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Try Chrome session first diff --git a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py index 43a53b30..21293014 100644 --- a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py +++ b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py @@ -28,7 +28,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'htmltotext' -OUTPUT_DIR = 'htmltotext' +OUTPUT_DIR = '.' 
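+# NOTE: hooks are now launched with their working directory already set to their
+# own per-extractor output directory, so results are written in place ('.') and
+# sibling extractor outputs are referenced with relative paths (e.g. '../chrome_session').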
OUTPUT_FILE = 'htmltotext.txt' @@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]: if not text or len(text) < 10: return False, None, 'No meaningful text extracted from HTML' - # Create output directory and write output + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) output_path = output_dir / OUTPUT_FILE output_path.write_text(text, encoding='utf-8') diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__51_media.py index 552f5258..1677fc2c 100644 --- a/archivebox/plugins/media/on_Snapshot__51_media.py +++ b/archivebox/plugins/media/on_Snapshot__51_media.py @@ -39,7 +39,7 @@ import rich_click as click EXTRACTOR_NAME = 'media' BIN_NAME = 'yt-dlp' BIN_PROVIDERS = 'pip,apt,brew,env' -OUTPUT_DIR = 'media' +OUTPUT_DIR = '.' def get_env(name: str, default: str = '') -> str: @@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int: return default -STATICFILE_DIR = 'staticfile' +STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" @@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]: extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '') media_max_size = get_env('MEDIA_MAX_SIZE', '750m') - # Create output directory + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) # Build command (later options take precedence) cmd = [ diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py index e9b5f63a..efd3ed6b 100644 --- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py @@ -27,7 +27,7 @@ import rich_click as click EXTRACTOR_NAME = 'mercury' BIN_NAME = 'postlight-parser' BIN_PROVIDERS = 'npm,env' -OUTPUT_DIR = 'mercury' +OUTPUT_DIR = '.' 
def get_env(name: str, default: str = '') -> str: @@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: """ timeout = get_env_int('TIMEOUT', 60) - # Create output directory + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) try: # Get text version diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js index 9cff5e33..72708e95 100755 --- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js +++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js @@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'parse_dom_outlinks'; -const OUTPUT_DIR = 'parse_dom_outlinks'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'outlinks.json'; const URLS_FILE = 'urls.jsonl'; // For crawl system -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -64,10 +64,7 @@ function getCdpUrl() { // Extract outlinks async function extractOutlinks(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js index c6967b46..e4787be7 100644 --- a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js @@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'pdf'; -const OUTPUT_DIR = 'pdf'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'output.pdf'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) { } // Check if staticfile extractor already downloaded this URL -const STATICFILE_DIR = 'staticfile'; +const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; } @@ -113,10 +113,7 @@ async function printToPdf(url) { const { width, height } = parseResolution(resolution); - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__52_readability.py index 165bc71c..a161e03f 100644 --- a/archivebox/plugins/readability/on_Snapshot__52_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py @@ -29,7 +29,7 @@ import rich_click as click EXTRACTOR_NAME = 'readability' BIN_NAME = 'readability-extractor' BIN_PROVIDERS = 'npm,env' -OUTPUT_DIR = 'readability' +OUTPUT_DIR = '.' 
def get_env(name: str, default: str = '') -> str: @@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: if not html_source: return False, None, 'No HTML source found (run singlefile, dom, or wget first)' - # Create output directory + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) try: # Run readability-extractor (outputs JSON by default) diff --git a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js index 3aba0581..aaa43232 100755 --- a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js +++ b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js @@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'redirects'; -const OUTPUT_DIR = 'redirects'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'redirects.json'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -60,10 +60,7 @@ function getCdpUrl() { // Track redirect chain async function trackRedirects(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.js b/archivebox/plugins/responses/on_Snapshot__24_responses.js index f5094dea..c69743b4 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.js @@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'responses'; -const OUTPUT_DIR = 'responses'; -const CHROME_SESSION_DIR = 'chrome_session'; +const OUTPUT_DIR = '.'; +const CHROME_SESSION_DIR = '../chrome_session'; // Resource types to capture (by default, capture everything) const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket']; @@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) { const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(',')); const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase()); - // Create output directories - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) + // Create subdirectories for organizing responses const allDir = path.join(OUTPUT_DIR, 'all'); if (!fs.existsSync(allDir)) { fs.mkdirSync(allDir, { recursive: true }); diff --git a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js index 35465ef1..db9b6467 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js @@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'screenshot'; -const OUTPUT_DIR = 'screenshot'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'screenshot.png'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) { } // Check if 
staticfile extractor already downloaded this URL -const STATICFILE_DIR = 'staticfile'; +const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; } @@ -116,10 +116,7 @@ async function takeScreenshot(url) { const { width, height } = parseResolution(resolution); - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py new file mode 100755 index 00000000..714b36df --- /dev/null +++ b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +Validation hook for ripgrep binary. + +Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'. +Outputs JSONL for InstalledBinary and Machine config updates. +""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str) -> str | None: + """Get version string from ripgrep binary.""" + try: + result = subprocess.run( + [abspath, '--version'], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + # ripgrep version string: "ripgrep 14.1.0" + first_line = result.stdout.strip().split('\n')[0] + parts = first_line.split() + for i, part in enumerate(parts): + if part.lower() == 'ripgrep' and i + 1 < len(parts): + return parts[i + 1] + # Try to find version number pattern + for part in parts: + if part[0].isdigit() and '.' 
in part: + return part + return first_line[:32] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_ripgrep() -> dict | None: + """Find ripgrep binary using shutil.which or env var.""" + # Check env var first - if it's an absolute path and exists, use it + ripgrep_env = os.environ.get('RIPGREP_BINARY', '') + if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file(): + abspath = ripgrep_env + else: + # Otherwise try shutil.which with the env var as the binary name + abspath = shutil.which(ripgrep_env) if ripgrep_env else None + if not abspath: + abspath = shutil.which('rg') + + if abspath and Path(abspath).is_file(): + return { + 'name': 'rg', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + """Validate ripgrep binary and output JSONL.""" + + # Check if ripgrep search backend is enabled + search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower() + + if search_backend != 'ripgrep': + # No-op: ripgrep is not the active search backend + sys.exit(0) + + result = find_ripgrep() + + if result and result.get('abspath'): + # Output InstalledBinary + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': result['name'], + 'abspath': result['abspath'], + 'version': result['version'], + 'sha256': result['sha256'], + 'binprovider': result['binprovider'], + })) + + # Output Machine config update + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/RIPGREP_BINARY', + 'value': result['abspath'], + })) + + if result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/RIPGREP_VERSION', + 'value': result['version'], + })) + + sys.exit(0) + else: + # Output Dependency request + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'rg', + 'bin_providers': 'apt,brew,cargo,env', + })) + + # Exit non-zero to indicate binary not found + print(f"ripgrep binary not found", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/search_backend_ripgrep/tests/__init__.py b/archivebox/plugins/search_backend_ripgrep/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py new file mode 100644 index 00000000..5e36f5bf --- /dev/null +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +""" +Tests for ripgrep binary detection and archivebox install functionality. + +Guards against regressions in: +1. Machine.config overrides not being used in version command +2. Ripgrep hook not resolving binary names via shutil.which() +3. 
SEARCH_BACKEND_ENGINE not being passed to hook environment +""" + +import os +import sys +import json +import shutil +import tempfile +import subprocess +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + + +def test_ripgrep_hook_detects_binary_from_path(): + """Test that ripgrep hook finds binary using shutil.which() when env var is just a name.""" + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + + # Skip if rg is not installed + if not shutil.which('rg'): + pytest.skip("ripgrep (rg) not installed") + + # Set SEARCH_BACKEND_ENGINE to enable the hook + env = os.environ.copy() + env['SEARCH_BACKEND_ENGINE'] = 'ripgrep' + env['RIPGREP_BINARY'] = 'rg' # Just the name, not the full path (this was the bug) + + result = subprocess.run( + [sys.executable, str(hook_path)], + capture_output=True, + text=True, + env=env, + timeout=10, + ) + + assert result.returncode == 0, f"Hook failed: {result.stderr}" + + # Parse JSONL output + lines = [line for line in result.stdout.strip().split('\n') if line.strip()] + assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)" + + installed_binary = json.loads(lines[0]) + assert installed_binary['type'] == 'InstalledBinary' + assert installed_binary['name'] == 'rg' + assert '/' in installed_binary['abspath'], "Expected full path, not just binary name" + assert Path(installed_binary['abspath']).is_file(), "Binary path should exist" + assert installed_binary['version'], "Version should be detected" + + machine_config = json.loads(lines[1]) + assert machine_config['type'] == 'Machine' + assert machine_config['key'] == 'config/RIPGREP_BINARY' + assert '/' in machine_config['value'], "Machine config should store full path" + + +def test_ripgrep_hook_skips_when_backend_not_ripgrep(): + """Test that ripgrep hook exits silently when search backend is not ripgrep.""" + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + + env = os.environ.copy() + env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend + + result = subprocess.run( + [sys.executable, str(hook_path)], + capture_output=True, + text=True, + env=env, + timeout=10, + ) + + assert result.returncode == 0, "Hook should exit successfully when backend is not ripgrep" + assert result.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep" + + +def test_ripgrep_hook_handles_absolute_path(): + """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path.""" + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + + rg_path = shutil.which('rg') + if not rg_path: + pytest.skip("ripgrep (rg) not installed") + + env = os.environ.copy() + env['SEARCH_BACKEND_ENGINE'] = 'ripgrep' + env['RIPGREP_BINARY'] = rg_path # Full absolute path + + result = subprocess.run( + [sys.executable, str(hook_path)], + capture_output=True, + text=True, + env=env, + timeout=10, + ) + + assert result.returncode == 0, f"Hook failed: {result.stderr}" + assert result.stdout.strip(), "Hook should produce output" + + installed_binary = json.loads(result.stdout.strip().split('\n')[0]) + assert installed_binary['abspath'] == rg_path + + +@pytest.mark.django_db +def test_machine_config_overrides_base_config(): + """ + Test that Machine.config overrides take precedence over base config. + + Guards against regression where archivebox version was showing binaries + as "not installed" even though they were detected and stored in Machine.config. 
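+
+    Expected lookup order (mirrored by the assertions below):
+        machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')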
+ """ + from machine.models import Machine, InstalledBinary + + machine = Machine.current() + + # Simulate a hook detecting chrome and storing it with a different path than base config + detected_chrome_path = '/custom/path/to/chrome' + machine.config['CHROME_BINARY'] = detected_chrome_path + machine.config['CHROME_VERSION'] = '143.0.7499.170' + machine.save() + + # Create InstalledBinary record + InstalledBinary.objects.create( + machine=machine, + name='chrome', + abspath=detected_chrome_path, + version='143.0.7499.170', + binprovider='env', + ) + + # Verify Machine.config takes precedence + from archivebox.config.configset import get_config + config = get_config() + + # Machine.config should override the base config value + assert machine.config.get('CHROME_BINARY') == detected_chrome_path + + # The version command should use Machine.config, not base config + # (Base config might have 'chromium' while Machine.config has the full path) + bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '') + assert bin_value == detected_chrome_path, \ + "Machine.config override should take precedence over base config" + + +@pytest.mark.django_db +def test_search_backend_engine_passed_to_hooks(): + """ + Test that SEARCH_BACKEND_ENGINE is passed to hook environment. + + Guards against regression where hooks couldn't determine which search backend was active. + """ + from pathlib import Path + from archivebox.hooks import build_hook_environment + from archivebox.config.configset import get_config + + config = get_config() + search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep') + + env = build_hook_environment(overrides=None) + + assert 'SEARCH_BACKEND_ENGINE' in env, \ + "SEARCH_BACKEND_ENGINE must be in hook environment" + assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \ + f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}" + + +@pytest.mark.django_db +def test_install_creates_installedbinary_records(): + """ + Test that archivebox install creates InstalledBinary records for detected binaries. + + This is an integration test that verifies the full install flow. 
+ """ + from machine.models import Machine, InstalledBinary + from crawls.models import Seed, Crawl + from crawls.statemachines import CrawlMachine + from archivebox.base_models.models import get_or_create_system_user_pk + + machine = Machine.current() + initial_binary_count = InstalledBinary.objects.filter(machine=machine).count() + + # Create an install crawl (like archivebox install does) + created_by_id = get_or_create_system_user_pk() + seed, _ = Seed.objects.get_or_create( + uri='archivebox://test-install', + label='Test dependency detection', + created_by_id=created_by_id, + defaults={'extractor': 'auto'}, + ) + + crawl = Crawl.objects.create( + seed=seed, + max_depth=0, + created_by_id=created_by_id, + status='queued', + ) + + # Run the crawl state machine (this triggers hooks) + sm = CrawlMachine(crawl) + sm.send('tick') # queued -> started (runs hooks) + + # Verify InstalledBinary records were created + final_binary_count = InstalledBinary.objects.filter(machine=machine).count() + assert final_binary_count > initial_binary_count, \ + "archivebox install should create InstalledBinary records" + + # Verify at least some common binaries were detected + common_binaries = ['git', 'wget', 'node'] + detected = [] + for bin_name in common_binaries: + if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists(): + detected.append(bin_name) + + assert detected, f"At least one of {common_binaries} should be detected" + + # Verify detected binaries have valid paths and versions + for binary in InstalledBinary.objects.filter(machine=machine): + if binary.abspath: # Only check non-empty paths + assert '/' in binary.abspath, \ + f"{binary.name} should have full path, not just name: {binary.abspath}" + # Version might be empty for some binaries, that's ok + + +@pytest.mark.django_db +def test_ripgrep_only_detected_when_backend_enabled(): + """ + Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'. + + Guards against ripgrep being installed/detected when not needed. 
+ """ + from machine.models import Machine, InstalledBinary + from crawls.models import Seed, Crawl + from crawls.statemachines import CrawlMachine + from archivebox.base_models.models import get_or_create_system_user_pk + from django.conf import settings + + if not shutil.which('rg'): + pytest.skip("ripgrep (rg) not installed") + + machine = Machine.current() + + # Clear any existing ripgrep records + InstalledBinary.objects.filter(machine=machine, name='rg').delete() + + # Test 1: With ripgrep backend - should be detected + with patch('archivebox.config.configset.get_config') as mock_config: + mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'} + + created_by_id = get_or_create_system_user_pk() + seed = Seed.objects.create( + uri='archivebox://test-rg-enabled', + label='Test ripgrep detection enabled', + created_by_id=created_by_id, + extractor='auto', + ) + + crawl = Crawl.objects.create( + seed=seed, + max_depth=0, + created_by_id=created_by_id, + status='queued', + ) + + sm = CrawlMachine(crawl) + sm.send('tick') + + # Ripgrep should be detected + rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists() + assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'" + + # Clear records again + InstalledBinary.objects.filter(machine=machine, name='rg').delete() + + # Test 2: With different backend - should NOT be detected + with patch('archivebox.config.configset.get_config') as mock_config: + mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'} + + seed2 = Seed.objects.create( + uri='archivebox://test-rg-disabled', + label='Test ripgrep detection disabled', + created_by_id=created_by_id, + extractor='auto', + ) + + crawl2 = Crawl.objects.create( + seed=seed2, + max_depth=0, + created_by_id=created_by_id, + status='queued', + ) + + sm2 = CrawlMachine(crawl2) + sm2.send('tick') + + # Ripgrep should NOT be detected + rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists() + assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'" + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py index a5d74236..fc496e74 100644 --- a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py +++ b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py @@ -29,7 +29,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'index_sonic' -OUTPUT_DIR = 'search_index' +OUTPUT_DIR = '.' # Text file patterns to index INDEXABLE_FILES = [ diff --git a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py index 36445ded..9f5f7311 100644 --- a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py +++ b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py @@ -27,7 +27,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'index_sqlite' -OUTPUT_DIR = 'search_index' +OUTPUT_DIR = '.' 
# Text file patterns to index, in priority order INDEXABLE_FILES = [ diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js index 3effeff3..b9efbd07 100755 --- a/archivebox/plugins/seo/on_Snapshot__38_seo.js +++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js @@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'seo'; -const OUTPUT_DIR = 'seo'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'seo.json'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -60,10 +60,7 @@ function getCdpUrl() { // Extract SEO metadata async function extractSeo(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); let browser = null; diff --git a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js b/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js index 6d9a6710..81d23435 100755 --- a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js +++ b/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js @@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); -const OUTPUT_DIR = 'singlefile'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'singlefile.html'; /** @@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) { .filter(fn => fn.endsWith('.html')) ); - // Ensure output directory exists - await fs.promises.mkdir(OUTPUT_DIR, { recursive: true }); + // Output directory is current directory (hook already runs in output dir) const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); @@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) { return null; } - // Ensure output directory exists - await fs.promises.mkdir(OUTPUT_DIR, { recursive: true }); + // Output directory is current directory (hook already runs in output dir) const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); // Build command diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py index 1dfcfe23..2fa60327 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py @@ -41,7 +41,7 @@ import rich_click as click EXTRACTOR_NAME = 'singlefile' BIN_NAME = 'single-file' BIN_PROVIDERS = 'npm,env' -OUTPUT_DIR = 'singlefile' +OUTPUT_DIR = '.' 
OUTPUT_FILE = 'singlefile.html' @@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int: return default -STATICFILE_DIR = 'staticfile' +STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" @@ -135,7 +135,7 @@ def get_version(binary: str) -> str: return '' -CHROME_SESSION_DIR = 'chrome_session' +CHROME_SESSION_DIR = '../chrome_session' def get_cdp_url() -> str | None: @@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: if extra_args: cmd.extend(extra_args.split()) - # Create output directory + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) output_path = output_dir / OUTPUT_FILE cmd.extend([url, str(output_path)]) @@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str): sys.exit(1) version = get_version(binary) - cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}' + cmd_str = f'{binary} {url} {OUTPUT_FILE}' # Run extraction success, output, error = save_singlefile(url, binary) diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js index 78e7592e..2ce4cd65 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js @@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'ssl'; -const OUTPUT_DIR = 'ssl'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.json'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -60,10 +60,7 @@ function getCdpUrl() { // Extract SSL details async function extractSsl(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Only extract SSL for HTTPS URLs diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py index 237f2d82..62aff11d 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py +++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py @@ -31,8 +31,8 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'staticfile' -OUTPUT_DIR = 'staticfile' -CHROME_SESSION_DIR = 'chrome_session' +OUTPUT_DIR = '.' 
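+# Downstream extractors (dom, pdf, screenshot, media, wget, singlefile) now look
+# for this output at '../staticfile' relative to their own output directories.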
+CHROME_SESSION_DIR = '../chrome_session' # Content-Types that indicate static files # These can't be meaningfully processed by Chrome-based extractors @@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]: if content_length and int(content_length) > max_size: return False, None, f'File too large: {int(content_length)} bytes > {max_size} max' - # Create output directory + # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - output_dir.mkdir(exist_ok=True) # Determine filename filename = get_filename_from_url(url) diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__32_title.js index afc60fb8..eb760444 100644 --- a/archivebox/plugins/title/on_Snapshot__32_title.js +++ b/archivebox/plugins/title/on_Snapshot__32_title.js @@ -21,9 +21,9 @@ const http = require('http'); // Extractor metadata const EXTRACTOR_NAME = 'title'; -const OUTPUT_DIR = 'title'; +const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'title.txt'; -const CHROME_SESSION_DIR = 'chrome_session'; +const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments function parseArgs() { @@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) { } async function extractTitle(url) { - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Output directory is current directory (hook already runs in output dir) const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Try Chrome session first diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__50_wget.py index 4b409d8c..265d43c2 100644 --- a/archivebox/plugins/wget/on_Snapshot__50_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py @@ -43,7 +43,7 @@ import rich_click as click EXTRACTOR_NAME = 'wget' BIN_NAME = 'wget' BIN_PROVIDERS = 'apt,brew,env' -OUTPUT_DIR = 'wget' +OUTPUT_DIR = '.' 
def get_env(name: str, default: str = '') -> str: @@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int: return default -STATICFILE_DIR = 'staticfile' +STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 8c580cc5..8d3f1e90 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -30,6 +30,1031 @@ color: white; cursor: pointer; } + + /* ============================================ + Modern card-based admin UI (shadcn-inspired) + ============================================ */ + + /* Base font improvements */ + body, html { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + font-size: 15px; + line-height: 1.6; + color: #0f172a; + background: #f8fafc; + } + + #container { + background: #f8fafc; + } + + #content { + padding: 24px; + } + + /* Main form container - flexbox grid */ + #content-main form > div, + #content form > div { + display: flex; + flex-wrap: wrap; + gap: 20px; + align-items: stretch; + } + + /* Each fieldset becomes a card */ + #content-main form fieldset, + #content form fieldset, + #content-main form .module:not(.inline-group), + #content form .module:not(.inline-group) { + background: #fff !important; + border: 1px solid #e2e8f0 !important; + border-top: 1px solid #e2e8f0 !important; + border-left: 1px solid #e2e8f0 !important; + border-right: 1px solid #e2e8f0 !important; + border-bottom: 1px solid #e2e8f0 !important; + border-radius: 12px !important; + padding: 0 !important; + margin: 0 !important; + box-shadow: 0 1px 3px rgba(0,0,0,0.04), 0 1px 2px rgba(0,0,0,0.06); + flex: 1 1 340px; + min-width: 320px; + max-width: calc(33.33% - 14px); + box-sizing: border-box; + display: flex; + flex-direction: column; + transition: box-shadow 0.2s ease, border-color 0.2s ease; + overflow: hidden; + } + + /* Wide fieldsets MUST override card max-width - placed after card rules for specificity */ + #content-main form fieldset.wide, + #content form fieldset.wide, + #content-main form fieldset:has(.field-archiveresults_list), + #content form fieldset:has(.field-archiveresults_list), + #content-main form fieldset:has(.field-snapshots), + #content form fieldset:has(.field-snapshots) { + flex: 1 1 100% !important; + max-width: 100% !important; + min-width: 100% !important; + width: 100% !important; + flex-basis: 100% !important; + } + + /* Inline groups should NOT have card constraints */ + #content-main form .inline-group, + #content form .inline-group, + .inline-group fieldset, + .inline-group .module { + flex: 1 1 100% !important; + max-width: 100% !important; + min-width: 100% !important; + width: 100% !important; + } + + #content-main form fieldset:hover, + #content form fieldset:hover { + box-shadow: 0 4px 6px rgba(0,0,0,0.05), 0 2px 4px rgba(0,0,0,0.06); + border-color: #cbd5e1; + } + + /* Archive results list content should take full width */ + .field-archiveresults_list, + .field-archiveresults_list .readonly, + .field-snapshots, + .field-snapshots .readonly { + width: 100% !important; + max-width: 100% !important; + background: transparent !important; + border: none !important; + padding: 0 !important; + } + + /* Card headers - no borders, just background */ + #content-main form fieldset h2, + #content form fieldset 
h2, + #content-main form .module h2, + #content form .module h2 { + margin: 0 !important; + padding: 8px 16px !important; + background: #f1f5f9 !important; + color: #334155 !important; + font-size: 12px !important; + font-weight: 600 !important; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important; + border: none !important; + border-top: none !important; + border-left: none !important; + border-right: none !important; + border-bottom: none !important; + border-radius: 0 !important; + text-transform: uppercase; + letter-spacing: 0.5px; + flex-shrink: 0; + -webkit-font-smoothing: antialiased; + box-shadow: none !important; + outline: none !important; + } + + /* Collapse toggle styling */ + #content-main form fieldset h2 a.collapse-toggle, + #content form fieldset h2 a.collapse-toggle { + color: #64748b; + } + + /* Card content area */ + #content-main form fieldset > div, + #content form fieldset > div { + padding: 20px; + flex: 1; + overflow-x: hidden; + overflow-y: visible; + min-width: 0; + } + + /* Form rows inside cards */ + #content-main form fieldset .form-row, + #content form fieldset .form-row { + padding: 8px 0; + border-bottom: 1px solid #f1f5f9; + min-width: 0; + min-height: auto; + } + + #content-main form fieldset .form-row:first-child, + #content form fieldset .form-row:first-child { + padding-top: 0; + } + + #content-main form fieldset .form-row:last-child, + #content form fieldset .form-row:last-child { + border-bottom: none; + padding-bottom: 0; + } + + /* Remove borders from nested fieldsets and flex-containers inside cards */ + #content-main form fieldset fieldset, + #content form fieldset fieldset, + #content-main form fieldset .flex-container, + #content form fieldset .flex-container, + #content-main form .module fieldset, + #content form .module fieldset { + background: transparent !important; + border: none !important; + border-radius: 0 !important; + box-shadow: none !important; + padding: 0 !important; + margin: 0 !important; + min-width: 0 !important; + max-width: 94% !important; + flex: none !important; + display: block !important; + } + + /* Nested fieldset headers should be invisible */ + #content-main form fieldset fieldset h2, + #content form fieldset fieldset h2, + #content-main form fieldset .flex-container legend, + #content form fieldset .flex-container legend { + background: transparent !important; + padding: 0 0 4px 0 !important; + font-size: 13px !important; + color: #374151 !important; + text-transform: none !important; + letter-spacing: normal !important; + } + + /* Ensure form elements inside cards don't overflow */ + #content-main form fieldset input, + #content-main form fieldset select, + #content-main form fieldset textarea, + #content form fieldset input, + #content form fieldset select, + #content form fieldset textarea { + max-width: 100%; + box-sizing: border-box; + } + + /* Related widget wrapper should fit within card */ + #content-main form fieldset .related-widget-wrapper, + #content form fieldset .related-widget-wrapper { + max-width: 100%; + } + + #content-main form fieldset .related-widget-wrapper select, + #content form fieldset .related-widget-wrapper select { + min-width: 0; + flex: 1; + } + + /* Labels inside cards */ + #content-main form fieldset .form-row > label, + #content form fieldset .form-row > label, + #content-main form fieldset .form-row > .flex-container > label, + #content form fieldset .form-row > .flex-container > label, + #content-main form label, + 
#content form label, + .aligned label, + legend { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + font-weight: 500; + color: #374151; + display: block; + margin-bottom: 8px; + float: none !important; + width: auto !important; + padding: 0 !important; + font-size: 13px; + letter-spacing: -0.01em; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; + } + + /* Readonly fields styling */ + #content-main form fieldset .readonly, + #content form fieldset .readonly { + background: #f8fafc; + padding: 12px 14px; + border-radius: 8px; + font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace; + font-size: 13px; + word-break: break-word; + line-height: 1.6; + border: 1px solid #e2e8f0; + color: #475569; + } + + /* Long content in readonly */ + #content-main form fieldset .readonly pre, + #content form fieldset .readonly pre { + margin: 0; + white-space: pre-wrap; + word-break: break-word; + font-family: inherit; + } + + /* Input styling */ + #content-main form input[type="text"], + #content-main form input[type="number"], + #content-main form input[type="url"], + #content-main form input[type="email"], + #content-main form input[type="password"], + #content form input[type="text"], + #content form input[type="number"], + #content form input[type="url"], + #content form input[type="email"], + #content form input[type="password"] { + width: 100%; + padding: 10px 14px; + border: 1px solid #d1d5db; + border-radius: 8px; + font-size: 14px; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + box-sizing: border-box; + background: #fff; + color: #1e293b; + transition: border-color 0.15s ease, box-shadow 0.15s ease; + -webkit-font-smoothing: antialiased; + } + + #content-main form select, + #content form select { + width: 100%; + border: 1px solid #d1d5db; + border-radius: 8px; + font-size: 14px; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + box-sizing: border-box; + background: #fff; + color: #1e293b; + transition: border-color 0.15s ease, box-shadow 0.15s ease; + -webkit-font-smoothing: antialiased; + } + + #content-main form input::placeholder, + #content form input::placeholder { + color: #94a3b8; + } + + /* Focus states */ + #content-main form input:focus, + #content-main form select:focus, + #content-main form textarea:focus, + #content form input:focus, + #content form select:focus, + #content form textarea:focus { + border-color: #3b82f6; + outline: none; + box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15); + } + + /* Textarea styling */ + #content-main form textarea, + #content form textarea { + width: 100%; + box-sizing: border-box; + border: 1px solid #d1d5db; + border-radius: 8px; + padding: 12px 14px; + font-size: 14px; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + line-height: 1.6; + resize: vertical; + min-height: 80px; + color: #1e293b; + transition: border-color 0.15s ease, box-shadow 0.15s ease; + -webkit-font-smoothing: antialiased; + } + + /* Fix vTextField width */ + .vTextField { + width: 100% !important; + } + + /* ============================================ + Button styling (shadcn-inspired) + ============================================ */ + + /* Base button styles */ + input[type="submit"], + button, + .button, + .btn, + a.button, + .submit-row input, + .submit-row a.button { + display: 
inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + padding: 10px 18px; + font-size: 14px; + font-weight: 500; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + line-height: 1.4; + border-radius: 8px; + border: 1px solid transparent; + cursor: pointer; + transition: all 0.15s ease; + text-decoration: none; + white-space: nowrap; + -webkit-font-smoothing: antialiased; + } + + /* Primary button (default) */ + input[type="submit"], + button[type="submit"], + .button.default, + .submit-row input[type="submit"] { + background: #0f172a; + color: #fff; + border-color: #0f172a; + } + + input[type="submit"]:hover, + button[type="submit"]:hover, + .button.default:hover, + .submit-row input[type="submit"]:hover { + background: #1e293b; + border-color: #1e293b; + } + + input[type="submit"]:active, + button[type="submit"]:active { + background: #334155; + transform: translateY(1px); + } + + /* Secondary/outline buttons */ + button:not([type="submit"]), + .button:not(.default), + a.button { + background: #fff; + color: #374151; + border-color: #d1d5db; + } + + button:not([type="submit"]):hover, + .button:not(.default):hover, + a.button:hover { + background: #f9fafb; + border-color: #9ca3af; + color: #1f2937; + } + + /* Danger button */ + .deletelink, + a.deletelink, + button.deletelink, + input[name="delete"], + .button.delete { + background: #fff; + color: #dc2626; + border-color: #fecaca; + } + + .deletelink:hover, + a.deletelink:hover, + button.deletelink:hover, + input[name="delete"]:hover, + .button.delete:hover { + background: #fef2f2; + border-color: #f87171; + color: #b91c1c; + } + + /* Small buttons */ + .btn-sm, + .object-tools a, + .datetimeshortcuts a { + padding: 6px 12px; + font-size: 13px; + border-radius: 6px; + } + + /* Object tools (top action buttons) */ + .object-tools { + margin-bottom: 20px; + } + + .object-tools li { + margin-left: 10px; + } + + .object-tools a { + background: #fff; + color: #374151; + border: 1px solid #d1d5db; + text-decoration: none; + display: inline-flex; + align-items: center; + } + + .object-tools a:hover { + background: #f9fafb; + border-color: #9ca3af; + } + + /* Submit row styling */ + .submit-row { + margin-top: 24px; + padding: 20px; + background: #fff; + border-radius: 12px; + border: 1px solid #e2e8f0; + box-shadow: 0 1px 3px rgba(0,0,0,0.04); + clear: both; + flex: 1 1 100%; + display: flex; + gap: 12px; + flex-wrap: wrap; + align-items: center; + } + + .submit-row p { + margin: 0; + } + + .submit-row .deletelink-box { + margin-left: auto; + } + + /* Responsive: 2 columns on medium screens */ + @media (max-width: 1400px) { + #content-main form fieldset, + #content form fieldset { + max-width: calc(50% - 10px); + flex: 1 1 320px; + } + } + + /* Responsive: stack on smaller screens */ + @media (max-width: 900px) { + #content-main form fieldset, + #content form fieldset { + flex: 1 1 100%; + max-width: 100%; + min-width: auto; + } + + #content { + padding: 16px; + } + } + + /* Module content padding */ + #content-main form .module > div, + #content form .module > div { + padding: 12px; + } + + /* Fix for JSON/config editor */ + .field-config .readonly, + .field-config textarea { + width: 100%; + min-height: 120px; + max-height: none; + } + + /* Related widget styling */ + .related-widget-wrapper { + display: flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + } + + .related-widget-wrapper select { + flex: 1; + min-width: 150px; + } + + .related-widget-wrapper 
a { + flex-shrink: 0; + padding: 8px; + border-radius: 6px; + color: #64748b; + transition: color 0.15s ease, background 0.15s ease; + } + + .related-widget-wrapper a:hover { + color: #1e293b; + background: #f1f5f9; + } + + /* Help text styling */ + .help { + font-size: 13px; + color: #64748b; + margin-top: 6px; + line-height: 1.5; + } + + /* Error styling */ + .errorlist { + color: #dc2626; + font-size: 13px; + margin: 6px 0; + padding: 0; + list-style: none; + } + + .errorlist li { + background: #fef2f2; + padding: 8px 12px; + border-radius: 6px; + border: 1px solid #fecaca; + } + + /* Inline related objects - force full width */ + .inline-group, + #archiveresult_set-group, + #content-main form .inline-group, + #content-main form > div > .inline-group, + #content form > div > .inline-group, + .change-form .inline-group, + div.inline-group { + flex: 1 1 100% !important; + max-width: 100% !important; + min-width: 100% !important; + width: 100% !important; + margin-top: 20px; + flex-basis: 100% !important; + } + + /* Ensure inline-group breaks out of card grid */ + #content-main form > div, + #content form > div { + flex-wrap: wrap; + } + + /* TabularInline table full width */ + .inline-group .tabular, + .inline-group table { + width: 100% !important; + } + + .inline-related { + margin: 12px 0; + padding: 16px; + background: #fff; + border-radius: 10px; + border: 1px solid #e2e8f0; + } + + .inline-related h3 { + margin: -16px -16px 16px -16px; + padding: 12px 16px; + background: #f8fafc; + border-radius: 9px 9px 0 0; + border-bottom: 1px solid #e2e8f0; + font-size: 13px; + font-weight: 600; + color: #374151; + } + + /* Tabular inline styling */ + .tabular { + border-radius: 8px; + overflow: hidden; + border: 1px solid #e2e8f0; + } + + .tabular td, .tabular th { + padding: 12px 14px; + font-size: 13px; + border-bottom: 1px solid #f1f5f9; + } + + .tabular th { + background: #f8fafc; + font-weight: 600; + color: #374151; + text-align: left; + } + + .tabular tr:last-child td { + border-bottom: none; + } + + /* Delete checkbox */ + .inline-deletelink { + color: #dc2626; + font-size: 13px; + } + + /* Datetime widgets */ + .datetimeshortcuts { + margin-left: 10px; + } + + .datetimeshortcuts a { + background: #f1f5f9; + color: #475569; + border: none; + padding: 4px 10px; + } + + .datetimeshortcuts a:hover { + background: #e2e8f0; + color: #1e293b; + } + + /* Aligned forms - fix label positioning */ + .aligned .form-row > div { + margin-left: 0 !important; + } + + /* Checkbox styling */ + input[type="checkbox"] { + width: 18px; + height: 18px; + border-radius: 4px; + border: 1px solid #d1d5db; + cursor: pointer; + accent-color: #3b82f6; + } + + /* Links styling */ + a { + color: #2563eb; + text-decoration: none; + transition: color 0.15s ease; + } + + a:hover { + color: #1d4ed8; + } + + /* Messages/alerts */ + .messagelist { + padding: 0; + margin: 0 0 20px 0; + } + + .messagelist li { + padding: 14px 18px; + border-radius: 10px; + font-size: 14px; + margin-bottom: 10px; + display: flex; + align-items: center; + gap: 10px; + } + + ul.messagelist li.success { + background: #f0fdf4 !important; + background-image: none !important; + border: 1px solid #bbf7d0; + color: #166534; + } + + .messagelist li.warning { + background: #fffbeb; + border: 1px solid #fde68a; + color: #92400e; + } + + .messagelist li.error { + background: #fef2f2; + border: 1px solid #fecaca; + color: #991b1b; + } + + /* Breadcrumbs */ + .breadcrumbs { + background: transparent; + padding: 12px 24px; + font-size: 13px; + color: 
#64748b; + } + + .breadcrumbs a { + color: #64748b; + } + + .breadcrumbs a:hover { + color: #1e293b; + } + + /* Action buttons in cards */ + .card .btn, + .card button { + margin-top: 10px; + } + + /* Select2 overrides */ + .select2-container--default .select2-selection--single, + .select2-container--default .select2-selection--multiple { + border: 1px solid #d1d5db; + border-radius: 8px; + min-height: 42px; + } + + .select2-container--default .select2-selection--single:focus, + .select2-container--default .select2-selection--multiple:focus { + border-color: #3b82f6; + box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15); + } + + /* ============================================ + Admin List/Changelist Page Styling + ============================================ */ + + /* Results table container */ + #changelist { + background: #fff; + border-radius: 12px; + border: 1px solid #e2e8f0; + box-shadow: 0 1px 3px rgba(0,0,0,0.04); + overflow: hidden; + } + + /* Table styling */ + #result_list { + width: 100%; + border-collapse: collapse; + font-size: 14px; + } + + #result_list thead th { + background: #f8fafc; + border-bottom: 2px solid #e2e8f0; + padding: 12px 16px; + font-weight: 600; + font-size: 13px; + color: #475569; + text-align: left; + text-transform: uppercase; + letter-spacing: 0.025em; + white-space: nowrap; + } + + #result_list thead th a { + color: #475569; + text-decoration: none; + } + + #result_list thead th a:hover { + color: #1e293b; + } + + #result_list thead th.sorted { + background: #f1f5f9; + } + + #result_list thead th .text span { + padding-right: 5px; + } + + #result_list tbody tr { + border-bottom: 1px solid #f1f5f9; + transition: background-color 0.15s ease; + } + + #result_list tbody tr:hover { + background-color: #f8fafc; + } + + #result_list tbody tr.selected { + background-color: #eff6ff; + } + + #result_list tbody td { + padding: 12px 16px; + color: #334155; + vertical-align: middle; + } + + #result_list tbody td a { + color: #2563eb; + font-weight: 500; + } + + #result_list tbody td a:hover { + color: #1d4ed8; + text-decoration: underline; + } + + /* Checkbox column */ + #result_list .action-checkbox, + #result_list th.action-checkbox-column { + width: 40px; + text-align: center; + padding: 12px 8px; + } + + /* Pagination */ + .paginator { + background: #f8fafc; + padding: 12px 16px; + border-top: 1px solid #e2e8f0; + font-size: 14px; + color: #64748b; + } + + .paginator a { + color: #2563eb; + padding: 6px 12px; + border-radius: 6px; + margin: 0 2px; + text-decoration: none; + } + + .paginator a:hover { + background: #e2e8f0; + } + + /* Toolbar / search bar */ + #toolbar { + padding: 16px; + background: #fff; + border-bottom: 1px solid #e2e8f0; + display: flex; + align-items: center; + gap: 12px; + } + + #toolbar form { + display: flex; + align-items: center; + gap: 8px; + flex: 1; + } + + #searchbar { + flex: 1; + max-width: 400px; + padding: 10px 14px; + border: 1px solid #d1d5db; + border-radius: 8px; + font-size: 14px; + } + + #searchbar:focus { + border-color: #3b82f6; + outline: none; + box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15); + } + + /* Filter sidebar */ + #changelist-filter { + background: #fff; + border: 1px solid #e2e8f0; + border-radius: 12px; + box-shadow: 0 1px 3px rgba(0,0,0,0.04); + overflow: hidden; + } + + #changelist-filter h2 { + background: #f8fafc; + padding: 12px 16px; + font-size: 13px; + font-weight: 600; + color: #475569; + text-transform: uppercase; + letter-spacing: 0.025em; + margin: 0; + border-bottom: 1px solid #e2e8f0; + } + + 
#changelist-filter h3 { + padding: 12px 16px 8px; + font-size: 12px; + font-weight: 600; + color: #64748b; + text-transform: uppercase; + letter-spacing: 0.05em; + margin: 0; + } + + #changelist-filter ul { + padding: 0 8px 12px; + margin: 0; + list-style: none; + } + + #changelist-filter li { + margin: 0; + } + + #changelist-filter li a { + display: block; + padding: 8px 12px; + color: #475569; + text-decoration: none; + border-radius: 6px; + font-size: 14px; + transition: background-color 0.15s ease; + } + + #changelist-filter li a:hover { + background: #f1f5f9; + color: #1e293b; + } + + #changelist-filter li.selected a { + background: #eff6ff; + color: #2563eb; + font-weight: 500; + } + + /* Actions bar */ + .actions { + padding: 12px 16px; + background: #f8fafc; + border-bottom: 1px solid #e2e8f0; + display: flex; + align-items: center; + gap: 12px; + flex-wrap: wrap; + } + + .actions label { + font-size: 14px; + color: #475569; + } + + .actions select { + padding: 8px 12px; + border: 1px solid #d1d5db; + border-radius: 6px; + font-size: 14px; + background: #fff; + } + + .actions .button { + padding: 8px 16px; + font-size: 14px; + } + + /* Object count */ + .actions .action-counter { + color: #64748b; + font-size: 14px; + } + + /* Empty results */ + #changelist-form .results + p, + .paginator + p { + padding: 40px; + text-align: center; + color: #64748b; + font-size: 15px; + } + + /* Date hierarchy */ + .xfull { + padding: 12px 16px; + background: #f8fafc; + border-bottom: 1px solid #e2e8f0; + } + + .xfull a { + color: #2563eb; + margin-right: 8px; + } {% endblock %} diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index bdf9d64f..1b9d9dde 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -57,13 +57,24 @@ box-shadow: 0 0 8px #3fb950; animation: pulse 2s infinite; } + #progress-monitor .status-dot.idle { + background: #d29922; + box-shadow: 0 0 4px #d29922; + } #progress-monitor .status-dot.stopped { - background: #f85149; + background: #6e7681; + } + #progress-monitor .status-dot.flash { + animation: flash 0.3s ease-out; } @keyframes pulse { 0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; } 50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; } } + @keyframes flash { + 0% { transform: scale(1.5); } + 100% { transform: scale(1); } + } /* Stats */ #progress-monitor .stats { @@ -89,6 +100,19 @@ #progress-monitor .stat-value.error { color: #f85149; } #progress-monitor .stat-value.warning { color: #d29922; } #progress-monitor .stat-value.info { color: #58a6ff; } + #progress-monitor .stat.clickable { + cursor: pointer; + padding: 2px 6px; + margin: -2px -6px; + border-radius: 4px; + transition: background 0.2s; + } + #progress-monitor .stat.clickable:hover { + background: rgba(255,255,255,0.1); + } + #progress-monitor .stat.clickable:active { + background: rgba(255,255,255,0.2); + } /* Toggle Button */ #progress-monitor .toggle-btn { @@ -259,48 +283,86 @@ padding: 0 12px 8px; } - /* Extractor List */ + /* Extractor List - Compact Badge Layout */ #progress-monitor .extractor-list { padding: 8px 12px; background: rgba(0,0,0,0.2); border-top: 1px solid #21262d; + display: flex; + flex-wrap: wrap; + gap: 4px; } - #progress-monitor .extractor-item { + #progress-monitor .extractor-badge { + position: relative; + display: inline-flex; + align-items: center; + gap: 4px; + padding: 3px 8px; + border-radius: 4px; + font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', 
monospace; + font-size: 10px; + background: #21262d; + overflow: hidden; + white-space: nowrap; + } + #progress-monitor .extractor-badge .progress-fill { + position: absolute; + top: 0; + left: 0; + bottom: 0; + z-index: 0; + transition: width 0.3s ease-out; + } + #progress-monitor .extractor-badge .badge-content { + position: relative; + z-index: 1; display: flex; align-items: center; - gap: 8px; - padding: 4px 0; + gap: 4px; } - #progress-monitor .extractor-icon { - font-size: 12px; - width: 16px; - text-align: center; + #progress-monitor .extractor-badge.queued { + color: #8b949e; } - #progress-monitor .extractor-icon.running { + #progress-monitor .extractor-badge.queued .progress-fill { + background: rgba(110, 118, 129, 0.2); + width: 0%; + } + #progress-monitor .extractor-badge.started { color: #d29922; - animation: spin 1s linear infinite; } - #progress-monitor .extractor-icon.success { + #progress-monitor .extractor-badge.started .progress-fill { + background: rgba(210, 153, 34, 0.3); + width: 50%; + animation: progress-pulse 1.5s ease-in-out infinite; + } + @keyframes progress-pulse { + 0%, 100% { opacity: 0.5; } + 50% { opacity: 1; } + } + #progress-monitor .extractor-badge.succeeded { color: #3fb950; } - #progress-monitor .extractor-icon.failed { + #progress-monitor .extractor-badge.succeeded .progress-fill { + background: rgba(63, 185, 80, 0.25); + width: 100%; + } + #progress-monitor .extractor-badge.failed { color: #f85149; } - #progress-monitor .extractor-icon.pending { - color: #8b949e; + #progress-monitor .extractor-badge.failed .progress-fill { + background: rgba(248, 81, 73, 0.25); + width: 100%; + } + #progress-monitor .extractor-badge .badge-icon { + font-size: 10px; + } + #progress-monitor .extractor-badge.started .badge-icon { + animation: spin 1s linear infinite; } @keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } } - #progress-monitor .extractor-name { - flex: 1; - font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace; - font-size: 11px; - } - #progress-monitor .extractor-progress { - width: 60px; - } /* Status Badge */ #progress-monitor .status-badge { @@ -356,11 +418,11 @@ Queued 0 -
+
Done 0
-
+
Failed 0
@@ -390,6 +452,24 @@ let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]')); let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]')); + // Baselines for resettable counters + let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0'); + let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0'); + let lastSucceeded = 0; + let lastFailed = 0; + + // Click handlers for resetting counters + document.getElementById('stat-succeeded').addEventListener('click', function() { + succeededBaseline = lastSucceeded; + localStorage.setItem('progress-succeeded-baseline', succeededBaseline); + document.getElementById('total-succeeded').textContent = '0'; + }); + document.getElementById('stat-failed').addEventListener('click', function() { + failedBaseline = lastFailed; + localStorage.setItem('progress-failed-baseline', failedBaseline); + document.getElementById('total-failed').textContent = '0'; + }); + function formatUrl(url) { try { const u = new URL(url); @@ -400,24 +480,18 @@ } function renderExtractor(extractor) { - const iconClass = extractor.status === 'started' ? 'running' : - extractor.status === 'succeeded' ? 'success' : - extractor.status === 'failed' ? 'failed' : 'pending'; const icon = extractor.status === 'started' ? '↻' : extractor.status === 'succeeded' ? '✓' : extractor.status === 'failed' ? '✗' : '○'; return ` -
- ${icon} - ${extractor.extractor} -
-
-
-
-
-
+ + + + ${icon} + ${extractor.extractor} + + `; } @@ -427,10 +501,14 @@ const statusIcon = snapshot.status === 'started' ? '↻' : '📄'; let extractorHtml = ''; - if (snapshot.active_extractors && snapshot.active_extractors.length > 0) { + if (snapshot.all_extractors && snapshot.all_extractors.length > 0) { + // Sort extractors alphabetically by name to prevent reordering on updates + const sortedExtractors = [...snapshot.all_extractors].sort((a, b) => + a.extractor.localeCompare(b.extractor) + ); extractorHtml = `
- ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')} + ${sortedExtractors.map(e => renderExtractor(e)).join('')}
`; } @@ -438,7 +516,7 @@ return `
- ${snapshot.active_extractors?.length ? '▶' : ''} + ${snapshot.all_extractors?.length ? '▶' : ''} ${statusIcon}
${formatUrl(snapshot.url)}
@@ -469,6 +547,40 @@ snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join(''); } + // Show warning if crawl is stuck (queued but can't start) + let warningHtml = ''; + if (crawl.status === 'queued' && !crawl.can_start) { + warningHtml = ` +
+ ⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'} +
+ `; + } else if (crawl.status === 'queued' && crawl.retry_at_future) { + // Queued but retry_at is in future (was claimed by worker, will retry) + warningHtml = ` +
+ 🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''} +
+ `; + } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) { + // Queued and waiting to be picked up by worker + warningHtml = ` +
+ ⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''} +
+ `; + } + + // Show snapshot info or URL count if no snapshots yet + let metaText = `depth: ${crawl.max_depth}`; + if (crawl.total_snapshots > 0) { + metaText += ` | ${crawl.total_snapshots} snapshots`; + } else if (crawl.urls_count > 0) { + metaText += ` | ${crawl.urls_count} URLs`; + } else if (crawl.seed_uri) { + metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`; + } + return `
@@ -476,10 +588,11 @@ ${statusIcon}
${crawl.label}
-
depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots
+
${metaText}
${crawl.completed_snapshots} done + ${crawl.started_snapshots || 0} active ${crawl.pending_snapshots} pending
${crawl.status} @@ -490,6 +603,7 @@ style="width: ${crawl.progress}%">
+ ${warningHtml}
${snapshotsHtml} @@ -542,25 +656,48 @@ data.snapshots_pending > 0 || data.snapshots_started > 0 || data.archiveresults_pending > 0 || data.archiveresults_started > 0; - // Update orchestrator status + // Update orchestrator status - show "Running" only when there's actual activity + // Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently const dot = document.getElementById('orchestrator-dot'); const text = document.getElementById('orchestrator-text'); - if (data.orchestrator_running) { - dot.classList.remove('stopped'); + const hasWorkers = data.total_workers > 0; + + if (hasWorkers || hasActivity) { + dot.classList.remove('stopped', 'idle'); dot.classList.add('running'); text.textContent = 'Running'; } else { - dot.classList.remove('running'); - dot.classList.add('stopped'); - text.textContent = 'Stopped'; + // No activity - show as idle (whether orchestrator process exists or not) + dot.classList.remove('stopped', 'running'); + dot.classList.add('idle'); + text.textContent = 'Idle'; } + // Pulse the dot to show we got fresh data + dot.classList.add('flash'); + setTimeout(() => dot.classList.remove('flash'), 300); + // Update stats document.getElementById('worker-count').textContent = data.total_workers; document.getElementById('total-queued').textContent = data.crawls_pending + data.snapshots_pending + data.archiveresults_pending; - document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded; - document.getElementById('total-failed').textContent = data.archiveresults_failed; + + // Store raw values and display relative to baseline + lastSucceeded = data.archiveresults_succeeded; + lastFailed = data.archiveresults_failed; + + // If baseline is higher than current (e.g. after DB reset), reset baseline + if (succeededBaseline > lastSucceeded) { + succeededBaseline = 0; + localStorage.setItem('progress-succeeded-baseline', '0'); + } + if (failedBaseline > lastFailed) { + failedBaseline = 0; + localStorage.setItem('progress-failed-baseline', '0'); + } + + document.getElementById('total-succeeded').textContent = lastSucceeded - succeededBaseline; + document.getElementById('total-failed').textContent = lastFailed - failedBaseline; // Render crawl tree if (data.active_crawls.length > 0) { diff --git a/archivebox/workers/management/commands/orchestrator.py b/archivebox/workers/management/commands/orchestrator.py index 27ef11d0..10360625 100644 --- a/archivebox/workers/management/commands/orchestrator.py +++ b/archivebox/workers/management/commands/orchestrator.py @@ -7,9 +7,14 @@ class Command(BaseCommand): help = 'Run the archivebox orchestrator' def add_arguments(self, parser): - parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)") + parser.add_argument( + '--exit-on-idle', + action='store_true', + default=False, + help="Exit when all work is complete (default: run forever)" + ) def handle(self, *args, **kwargs): - daemon = kwargs.get('daemon', False) - orchestrator = Orchestrator(exit_on_idle=not daemon) + exit_on_idle = kwargs.get('exit_on_idle', False) + orchestrator = Orchestrator(exit_on_idle=exit_on_idle) orchestrator.runloop() diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index a4c1a390..4536fa83 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -12,16 +12,17 @@ Architecture: └── Each worker spawns task subprocesses via CLI Usage: - # Embedded in other commands (exits when done) + # Default: runs 
forever (for use as subprocess of server) + orchestrator = Orchestrator(exit_on_idle=False) + orchestrator.runloop() + + # Exit when done (for embedded use in other commands) orchestrator = Orchestrator(exit_on_idle=True) orchestrator.runloop() - - # Daemon mode (runs forever) - orchestrator = Orchestrator(exit_on_idle=False) - orchestrator.start() # fork and return - + # Or run via CLI - archivebox orchestrator [--daemon] + archivebox manage orchestrator # runs forever + archivebox manage orchestrator --exit-on-idle # exits when done """ __package__ = 'archivebox.workers' @@ -45,6 +46,14 @@ from .pid_utils import ( ) +def _run_orchestrator_process(exit_on_idle: bool) -> None: + """Top-level function for multiprocessing (must be picklable).""" + from archivebox.config.django import setup_django + setup_django() + orchestrator = Orchestrator(exit_on_idle=exit_on_idle) + orchestrator.runloop() + + class Orchestrator: """ Manages worker processes by polling queues and spawning workers as needed. @@ -277,12 +286,12 @@ class Orchestrator: Fork orchestrator as a background process. Returns the PID of the new process. """ - def run_orchestrator(): - from archivebox.config.django import setup_django - setup_django() - self.runloop() - - proc = Process(target=run_orchestrator, name='orchestrator') + # Use module-level function to avoid pickle errors with local functions + proc = Process( + target=_run_orchestrator_process, + args=(self.exit_on_idle,), + name='orchestrator' + ) proc.start() assert proc.pid is not None diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py index 69b440c4..898f87fe 100644 --- a/archivebox/workers/supervisord_util.py +++ b/archivebox/workers/supervisord_util.py @@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers" ORCHESTRATOR_WORKER = { "name": "worker_orchestrator", - "command": "archivebox manage orchestrator", + "command": "archivebox manage orchestrator", # runs forever by default "autostart": "true", "autorestart": "true", "stdout_logfile": "logs/worker_orchestrator.log", @@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name): def tail_worker_logs(log_path: str): get_or_create_supervisord_process(daemonize=False) - + from rich.live import Live from rich.table import Table - + table = Table() table.add_column("TS") table.add_column("URL") - + try: with Live(table, refresh_per_second=1) as live: # update 4 times a second to feel fluid with open(log_path, 'r') as f: @@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str): except SystemExit: pass + +def tail_multiple_worker_logs(log_files: list[str], follow=True): + """Tail multiple log files simultaneously, interleaving their output.""" + import select + from pathlib import Path + + # Convert relative paths to absolute paths + log_paths = [] + for log_file in log_files: + log_path = Path(log_file) + if not log_path.is_absolute(): + log_path = CONSTANTS.DATA_DIR / log_path + + # Create log file if it doesn't exist + if not log_path.exists(): + log_path.parent.mkdir(parents=True, exist_ok=True) + log_path.touch() + + log_paths.append(log_path) + + # Open all log files + file_handles = [] + for log_path in log_paths: + try: + f = open(log_path, 'r') + # Seek to end of file if following + if follow: + f.seek(0, 2) # Seek to end + file_handles.append((log_path.name, f)) + except Exception as e: + print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]") + + if not file_handles: + print("[red]No log files could be opened[/red]") + return + + # Print which logs we're tailing 
+ log_names = [name for name, _ in file_handles] + print(f"[dim]Tailing: {', '.join(log_names)}[/dim]") + print() + + try: + while follow: + # Read available lines from all files + for log_name, f in file_handles: + line = f.readline() + if line: + # Colorize based on log source + if 'orchestrator' in log_name.lower(): + color = 'cyan' + elif 'daphne' in log_name.lower(): + color = 'green' + else: + color = 'white' + + # Strip ANSI codes if present (supervisord does this but just in case) + import re + line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip()) + + if line_clean: + print(f'[{color}][{log_name}][/{color}] {line_clean}') + + # Small sleep to avoid busy-waiting + time.sleep(0.1) + + except (KeyboardInterrupt, BrokenPipeError, IOError): + print("\n[yellow][i] Stopped tailing logs[/i][/yellow]") + except SystemExit: + pass + finally: + # Close all file handles + for _, f in file_handles: + try: + f.close() + except Exception: + pass + def watch_worker(supervisor, daemon_name, interval=5): """loop continuously and monitor worker's health""" while True: diff --git a/archivebox/workers/tasks.py b/archivebox/workers/tasks.py index 01858e7f..87bb3f32 100644 --- a/archivebox/workers/tasks.py +++ b/archivebox/workers/tasks.py @@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator. These functions queue Snapshots/Crawls for processing by setting their status to QUEUED, which the orchestrator workers will pick up and process. + +NOTE: These functions do NOT start the orchestrator - they assume it's already +running via `archivebox server` (supervisord) or will be run inline by the CLI. """ __package__ = 'archivebox.workers' @@ -10,16 +13,6 @@ __package__ = 'archivebox.workers' from django.utils import timezone -def ensure_orchestrator_running(): - """Ensure the orchestrator is running to process queued items.""" - from .orchestrator import Orchestrator - - if not Orchestrator.is_running(): - # Start orchestrator in background - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.start() - - def bg_add(add_kwargs: dict) -> int: """ Add URLs and queue them for archiving. 
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int: result = add(**add_kwargs) - # Ensure orchestrator is running to process the new snapshots - ensure_orchestrator_running() - return len(result) if result else 0 @@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int: ) queued_count += 1 - # Ensure orchestrator is running to process the queued snapshots - if queued_count > 0: - ensure_orchestrator_running() - return queued_count @@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None status=Snapshot.StatusChoices.QUEUED, retry_at=timezone.now(), ) - - # Ensure orchestrator is running to process the queued snapshot - ensure_orchestrator_running() return 1 return 0 diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 78e062da..991a0e72 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -67,8 +67,8 @@ class Worker: # Configuration (can be overridden by subclasses) MAX_TICK_TIME: ClassVar[int] = 60 MAX_CONCURRENT_TASKS: ClassVar[int] = 1 - POLL_INTERVAL: ClassVar[float] = 0.5 - IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit) + POLL_INTERVAL: ClassVar[float] = 1.0 + IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval) def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any): self.worker_id = worker_id
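Usage sketch (not part of the patch): with ensure_orchestrator_running() removed from workers/tasks.py, callers that are not running under `archivebox server` (supervisord) are assumed to drain the queue themselves. A minimal sketch under that assumption, using only the call signatures visible in the hunks above (setup_django, bg_archive_snapshots, Orchestrator(exit_on_idle=...).runloop); the wrapper name archive_inline and the shape of the snapshots argument are hypothetical:

    # Sketch: queue Snapshot objects, then run an inline orchestrator until idle.
    from archivebox.config.django import setup_django

    def archive_inline(snapshots):
        # setup_django() assumed to be required before ORM access, as in
        # _run_orchestrator_process() above.
        setup_django()
        from archivebox.workers.tasks import bg_archive_snapshots
        from archivebox.workers.orchestrator import Orchestrator

        # Only flips each Snapshot's status to QUEUED; does not start workers.
        queued = bg_archive_snapshots(snapshots)
        if queued:
            # No supervisord-managed orchestrator here, so process the queue
            # ourselves and exit once all work is complete.
            Orchestrator(exit_on_idle=True).runloop()
        return queued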