diff --git a/archivebox/ArchiveBox.conf b/archivebox/ArchiveBox.conf
new file mode 100644
index 00000000..fb119776
--- /dev/null
+++ b/archivebox/ArchiveBox.conf
@@ -0,0 +1,3 @@
+[SERVER_CONFIG]
+SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4
+
diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py
index 056f0ead..78545257 100644
--- a/archivebox/api/admin.py
+++ b/archivebox/api/admin.py
@@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
- fields = ('created_by', 'token', 'expires', *readonly_fields)
+
+ fieldsets = (
+ ('Token', {
+ 'fields': ('token', 'expires'),
+ 'classes': ('card',),
+ }),
+ ('Owner', {
+ 'fields': ('created_by',),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
+ fieldsets = (
+ ('Webhook', {
+ 'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Authentication', {
+ 'fields': ('auth_token',),
+ 'classes': ('card',),
+ }),
+ ('Status', {
+ 'fields': ('enabled', 'last_success', 'last_error'),
+ 'classes': ('card',),
+ }),
+ ('Owner', {
+ 'fields': ('created_by',),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
+
def register_admin(admin_site):
admin_site.register(APIToken, APITokenAdmin)
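
Note: the fieldsets layout above is the same convention applied to every admin class touched in this diff. A minimal sketch of the pattern, with placeholder model/field names ('card' and 'wide' are just CSS classes that the admin theme styles as panels):

    from django.contrib import admin

    class ExampleAdmin(admin.ModelAdmin):
        readonly_fields = ('created_at', 'modified_at')
        fieldsets = (
            ('Main', {
                'fields': ('name',),                      # editable fields
                'classes': ('card', 'wide'),              # full-width card
            }),
            ('Timestamps', {
                'fields': ('created_at', 'modified_at'),  # must also appear in readonly_fields
                'classes': ('card',),
            }),
        )
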
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index e9bcc53e..b668d26b 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached
if bg:
- # Background mode: start orchestrator and return immediately
- print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
- orchestrator = Orchestrator(exit_on_idle=True)
- orchestrator.start() # Fork to background
+ # Background mode: just queue work and return (orchestrator via server will pick it up)
+ print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
- # Foreground mode: run orchestrator until all work is done
+ # Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete
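
Rough sketch of the two code paths above, reusing the Orchestrator API exactly as it appears in this diff (the wrapper function itself is hypothetical):

    from workers.orchestrator import Orchestrator

    def process_pending_work(bg: bool) -> None:
        if bg:
            # Background: nothing to run here; queued Crawls/Snapshots wait for the
            # orchestrator that supervisord starts alongside `archivebox server`.
            return
        # Foreground: run the orchestrator inline and block until the queue drains.
        Orchestrator(exit_on_idle=True).runloop()
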
diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py
index c3fa89ef..affea542 100644
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
- # Look up by URL
- try:
- snap = Snapshot.objects.get(url=record['url'])
+ # Look up by URL (get most recent if multiple exist)
+ snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
+ if snap:
snapshot_ids.add(str(snap.id))
- except Snapshot.DoesNotExist:
+ else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:
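
Minimal sketch of the "most recent match wins" lookup used above, which becomes necessary once Snapshot.url stops being unique later in this diff (helper name is illustrative):

    from core.models import Snapshot

    def latest_snapshot_for_url(url: str) -> Snapshot | None:
        # .filter().first() never raises DoesNotExist/MultipleObjectsReturned,
        # even when the same URL now exists in several crawls.
        return Snapshot.objects.filter(url=url).order_by('-created_at').first()
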
diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py
index 5674b3d8..b797944d 100755
--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
- seed = Seed.objects.create(
+ seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
+ defaults={
+ 'extractor': 'auto',
+ }
)
- crawl = Crawl.objects.create(
+ crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
- status='queued',
+ defaults={
+ 'status': 'queued',
+ }
)
+ # If crawl already existed, reset it to queued state so it can be processed again
+ if not created:
+ crawl.status = 'queued'
+ crawl.retry_at = timezone.now()
+ crawl.save()
+
print(f'[+] Created dependency detection crawl: {crawl.id}')
+ print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
+
+ # Verify the crawl is in the queue
+ from crawls.models import Crawl as CrawlModel
+ queued_crawls = CrawlModel.objects.filter(
+ retry_at__lte=timezone.now()
+ ).exclude(
+ status__in=CrawlModel.FINAL_STATES
+ )
+ print(f'[+] Crawls in queue: {queued_crawls.count()}')
+ if queued_crawls.exists():
+ for c in queued_crawls:
+ print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
+
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()
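
The install flow is now idempotent: rerunning it reuses the existing Seed/Crawl rows and simply re-queues the crawl. A hedged sketch of the re-queue step, assuming the status/retry_at semantics shown above:

    from django.utils import timezone
    from crawls.models import Crawl

    def requeue_crawl(crawl: Crawl) -> None:
        # Workers only claim crawls whose retry_at is in the past and whose
        # status is not in Crawl.FINAL_STATES, so reset both fields.
        crawl.status = 'queued'
        crawl.retry_at = timezone.now()
        crawl.save()
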
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index c369e6ce..146e047c 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
- print('[green][+] Starting ArchiveBox webserver...[/green]')
- print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
- print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
- print(' > Writing ArchiveBox error log to ./logs/errors.log')
-
if SHELL_CONFIG.DEBUG:
+ print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
+ print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+ print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+ print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
- from workers.supervisord_util import start_server_workers
+ from workers.supervisord_util import (
+ get_existing_supervisord_process,
+ get_worker,
+ start_server_workers,
+ tail_multiple_worker_logs,
+ )
+ # Check if supervisord is already running
+ supervisor = get_existing_supervisord_process()
+ if supervisor:
+ daphne_proc = get_worker(supervisor, 'worker_daphne')
+
+ # If daphne is already running, just tail logs
+ if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
+ orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
+ print('[yellow][!] ArchiveBox server is already running[/yellow]')
+ print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+ if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
+ print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
+ print()
+ print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
+ print()
+
+ # Tail logs for both workers
+ tail_multiple_worker_logs(
+ log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
+ follow=True,
+ )
+ return
+ # Otherwise, daphne is not running - fall through to start it
+
+ # No existing workers found - start new ones
+ print('[green][+] Starting ArchiveBox webserver...[/green]')
+ print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+ print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+ print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py
index 2b231c9f..c891b8ea 100755
--- a/archivebox/cli/archivebox_version.py
+++ b/archivebox/cli/archivebox_version.py
@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
- bin_value = config.get(key, '').strip()
+ # Prioritize Machine.config overrides over base config
+ bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
- is_path = '/' in bin_value
+ is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath
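
Illustrative helper (not part of the diff) capturing the new lookup precedence above: a per-Machine config override wins, otherwise the flat config value is used:

    def resolve_binary_value(key: str, machine_config: dict, base_config: dict) -> str:
        # e.g. resolve_binary_value('CHROME_BINARY', machine.config, config)
        return str(machine_config.get(key) or base_config.get(key, '')).strip()
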
diff --git a/archivebox/config/django.py b/archivebox/config/django.py
index 77169ee3..d7910ec0 100644
--- a/archivebox/config/django.py
+++ b/archivebox/config/django.py
@@ -5,7 +5,6 @@ import sys
from datetime import datetime, timezone
-from rich.progress import Progress
from rich.console import Console
import django
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True)
logging.CONSOLE = CONSOLE
-INITIAL_STARTUP_PROGRESS = None
-INITIAL_STARTUP_PROGRESS_TASK = 0
-
-def bump_startup_progress_bar(advance=1):
- global INITIAL_STARTUP_PROGRESS
- global INITIAL_STARTUP_PROGRESS_TASK
- if INITIAL_STARTUP_PROGRESS:
- INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore
-
-
def setup_django_minimal():
# sys.path.append(str(CONSTANTS.PACKAGE_DIR))
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
@@ -49,9 +38,7 @@ DJANGO_SET_UP = False
def setup_django(check_db=False, in_memory_db=False) -> None:
from rich.panel import Panel
-
- global INITIAL_STARTUP_PROGRESS
- global INITIAL_STARTUP_PROGRESS_TASK
+
global DJANGO_SET_UP
if DJANGO_SET_UP:
@@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
- with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
- INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
-
- from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
-
- # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
- if IS_ROOT and ARCHIVEBOX_USER != 0:
- with SudoPermission(uid=0):
- # running as root is a special case where it's ok to be a bit slower
- # make sure data dir is always owned by the correct user
- os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
- os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
+ from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
- bump_startup_progress_bar()
- try:
- from django.core.management import call_command
-
- bump_startup_progress_bar()
+ # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
+ if IS_ROOT and ARCHIVEBOX_USER != 0:
+ with SudoPermission(uid=0):
+ # running as root is a special case where it's ok to be a bit slower
+ # make sure data dir is always owned by the correct user
+ os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
+ os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
- if in_memory_db:
- raise Exception('dont use this anymore')
-
- # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
- # in those cases we create a temporary in-memory db and run the migrations
- # immediately to get a usable in-memory-database at startup
- os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
+ try:
+ from django.core.management import call_command
+
+ if in_memory_db:
+ raise Exception('dont use this anymore')
+
+ # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
+ # in those cases we create a temporary in-memory db and run the migrations
+ # immediately to get a usable in-memory-database at startup
+ os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
+ django.setup()
+
+ call_command("migrate", interactive=False, verbosity=0)
+ else:
+ # Otherwise use default sqlite3 file-based database and initialize django
+ # without running migrations automatically (user runs them manually by calling init)
+ try:
django.setup()
-
- bump_startup_progress_bar()
- call_command("migrate", interactive=False, verbosity=0)
- else:
- # Otherwise use default sqlite3 file-based database and initialize django
- # without running migrations automatically (user runs them manually by calling init)
- try:
- django.setup()
- except Exception as e:
- bump_startup_progress_bar(advance=1000)
-
- is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
- if not is_using_meta_cmd:
- # show error message to user only if they're not running a meta command / just trying to get help
- STDERR.print()
- STDERR.print(Panel(
- f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
- title='\n\n[red][X] Error while trying to load database![/red]',
- subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
- expand=False,
- style='bold red',
- ))
- STDERR.print()
- STDERR.print_exception(show_locals=False)
- return
-
- bump_startup_progress_bar()
-
- from django.conf import settings
-
- # log startup message to the error log
- with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
- command = ' '.join(sys.argv)
- ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
- f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
-
- if check_db:
- # make sure the data dir is owned by a non-root user
- if CONSTANTS.DATA_DIR.stat().st_uid == 0:
- STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
- STDERR.print(f' {CONSTANTS.DATA_DIR}')
+ except Exception as e:
+ is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
+ if not is_using_meta_cmd:
+ # show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
- STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
- STDERR.print(' cd path/to/your/archive/data')
- STDERR.print(' archivebox [command]')
+ STDERR.print(Panel(
+ f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
+ title='\n\n[red][X] Error while trying to load database![/red]',
+ subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
+ expand=False,
+ style='bold red',
+ ))
STDERR.print()
- raise SystemExit(9)
-
- # Create cache table in DB if needed
- try:
- from django.core.cache import cache
- cache.get('test', None)
- except django.db.utils.OperationalError:
- call_command("createcachetable", verbosity=0)
+ STDERR.print_exception(show_locals=False)
+ return
- bump_startup_progress_bar()
+ from django.conf import settings
- # if archivebox gets imported multiple times, we have to close
- # the sqlite3 whenever we init from scratch to avoid multiple threads
- # sharing the same connection by accident
- from django.db import connections
- for conn in connections.all():
- conn.close_if_unusable_or_obsolete()
+ # log startup message to the error log
+ with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
+ command = ' '.join(sys.argv)
+ ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+ f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
- sql_index_path = CONSTANTS.DATABASE_FILE
- assert os.access(sql_index_path, os.F_OK), (
- f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
+ if check_db:
+ # make sure the data dir is owned by a non-root user
+ if CONSTANTS.DATA_DIR.stat().st_uid == 0:
+ STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
+ STDERR.print(f' {CONSTANTS.DATA_DIR}')
+ STDERR.print()
+ STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
+ STDERR.print(' cd path/to/your/archive/data')
+ STDERR.print(' archivebox [command]')
+ STDERR.print()
+ raise SystemExit(9)
- bump_startup_progress_bar()
+ # Create cache table in DB if needed
+ try:
+ from django.core.cache import cache
+ cache.get('test', None)
+ except django.db.utils.OperationalError:
+ call_command("createcachetable", verbosity=0)
- # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
- # if settings.DEBUG_LOGFIRE:
- # from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
- # SQLite3Instrumentor().instrument()
+ # if archivebox gets imported multiple times, we have to close
+ # the sqlite3 whenever we init from scratch to avoid multiple threads
+ # sharing the same connection by accident
+ from django.db import connections
+ for conn in connections.all():
+ conn.close_if_unusable_or_obsolete()
- # import logfire
+ sql_index_path = CONSTANTS.DATABASE_FILE
+ assert os.access(sql_index_path, os.F_OK), (
+ f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
- # logfire.configure()
- # logfire.instrument_django(is_sql_commentor_enabled=True)
- # logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
+ # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
+ # if settings.DEBUG_LOGFIRE:
+ # from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
+ # SQLite3Instrumentor().instrument()
+
+ # import logfire
+
+ # logfire.configure()
+ # logfire.instrument_django(is_sql_commentor_enabled=True)
+ # logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
+
+ except KeyboardInterrupt:
+ raise SystemExit(2)
- except KeyboardInterrupt:
- raise SystemExit(2)
-
DJANGO_SET_UP = True
-
- INITIAL_STARTUP_PROGRESS = None
- INITIAL_STARTUP_PROGRESS_TASK = None
diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index 5497d2a6..59864571 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
+def render_archiveresults_list(archiveresults_qs, limit=50):
+ """Render a nice inline list view of archive results with status, extractor, output, and actions."""
+
+ results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
+
+ if not results:
+        return mark_safe('<p>No Archive Results yet...</p>')
+
+ # Status colors
+ status_colors = {
+ 'succeeded': ('#166534', '#dcfce7'), # green
+ 'failed': ('#991b1b', '#fee2e2'), # red
+ 'queued': ('#6b7280', '#f3f4f6'), # gray
+ 'started': ('#92400e', '#fef3c7'), # amber
+ }
+
+ rows = []
+ for idx, result in enumerate(results):
+ status = result.status or 'queued'
+ color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
+
+ # Get extractor icon
+ icon = get_extractor_icon(result.extractor)
+
+ # Format timestamp
+ end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
+
+ # Truncate output for display
+ full_output = result.output or '-'
+ output_display = full_output[:60]
+ if len(full_output) > 60:
+ output_display += '...'
+
+ # Get full command as tooltip
+ cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
+
+ # Build output link
+ output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+
+ # Get version - try cmd_version field
+ version = result.cmd_version if result.cmd_version else '-'
+
+ # Unique ID for this row's expandable output
+ row_id = f'output_{idx}_{str(result.id)[:8]}'
+
+        rows.append(f'''
+            <tr>
+                <td>
+                    <span style="color: {color}; background: {bg}; padding: 2px 8px; border-radius: 4px;">{status}</span>
+                </td>
+                <td>
+                    {icon}
+                </td>
+                <td>
+                    {result.extractor}
+                </td>
+                <td>
+                    <a href="{output_link}" title="{cmd_str}" target="_blank">
+                        {output_display}
+                    </a>
+                </td>
+                <td>
+                    {end_time}
+                </td>
+                <td>
+                    {version}
+                </td>
+                <td>
+                    <a href="#{row_id}">Details</a>
+                </td>
+            </tr>
+            <tr>
+                <td colspan="7">
+                    <details id="{row_id}">
+                        <summary>
+                            Details &amp; Output
+                        </summary>
+                        <div>
+                            ID: {str(result.id)[:8]}...
+                            Version: {version}
+                            PWD: {result.pwd or '-'}
+                        </div>
+                        <div>
+                            Output:
+                            <pre>
+                                {full_output}
+                            </pre>
+                            Command:
+                            <pre>
+                                {cmd_str}
+                            </pre>
+                        </div>
+                    </details>
+                </td>
+            </tr>
+        ''')
+
+ total_count = archiveresults_qs.count()
+ footer = ''
+ if total_count > limit:
+        footer = f'''
+            <tr>
+                <td colspan="7" style="text-align: center; color: #6b7280;">
+                    Showing {limit} of {total_count} results
+                    <span>View all →</span>
+                </td>
+            </tr>
+        '''
+
+    return mark_safe(f'''
+    <div style="overflow-x: auto;">
+        <table style="width: 100%; border-collapse: collapse;">
+            <thead>
+                <tr style="text-align: left; border-bottom: 2px solid #e5e7eb;">
+                    <th>Status</th>
+                    <th></th>
+                    <th>Extractor</th>
+                    <th>Output</th>
+                    <th>Completed</th>
+                    <th>Version</th>
+                    <th>Actions</th>
+                </tr>
+            </thead>
+            <tbody>
+                {''.join(rows)}
+                {footer}
+            </tbody>
+        </table>
+    </div>
+    ''')
+
+
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
- fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
+ fieldsets = (
+ ('Snapshot', {
+ 'fields': ('snapshot', 'snapshot_info', 'tags_str'),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Extractor', {
+ 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
+ 'classes': ('card',),
+ }),
+ ('Timing', {
+ 'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ ('Command', {
+ 'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
+ 'classes': ('card',),
+ }),
+ ('Output', {
+ 'fields': ('output', 'output_summary'),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Metadata', {
+ 'fields': ('created_by',),
+ 'classes': ('card',),
+ }),
+ )
+
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
-
+
paginator = AccelleratedPaginator
save_on_top = True
-
+
actions = ['delete_selected']
-
+
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index d1917e52..d25f291c 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
-from core.admin_archiveresults import ArchiveResultInline
+from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
- readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
+ readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
- fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
+
+ fieldsets = (
+ ('URL', {
+ 'fields': ('url', 'title'),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Status', {
+ 'fields': ('status', 'retry_at', 'status_info'),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
+ 'classes': ('card',),
+ }),
+ ('Relations', {
+ 'fields': ('crawl', 'created_by', 'tags_str'),
+ 'classes': ('card',),
+ }),
+ ('Config', {
+ 'fields': ('config',),
+ 'classes': ('card',),
+ }),
+ ('Files', {
+ 'fields': ('output_dir',),
+ 'classes': ('card',),
+ }),
+ ('Actions', {
+ 'fields': ('admin_actions',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Archive Results', {
+ 'fields': ('archiveresults_list',),
+ 'classes': ('card', 'wide'),
+ }),
+ )
+
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
- inlines = [TagInline, ArchiveResultInline]
+ inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)
+ @admin.display(description='Archive Results')
+ def archiveresults_list(self, obj):
+ return render_archiveresults_list(obj.archiveresult_set.all())
+
@admin.display(
description='Title',
ordering='title',
diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py
index b0f09b9b..f2d0a8cf 100644
--- a/archivebox/core/admin_tags.py
+++ b/archivebox/core/admin_tags.py
@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
- fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
+ fieldsets = (
+ ('Tag Info', {
+ 'fields': ('name', 'slug'),
+ 'classes': ('card',),
+ }),
+ ('Metadata', {
+ 'fields': ('id', 'created_by', 'created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ ('Snapshots', {
+ 'fields': ('snapshots',),
+ 'classes': ('card', 'wide'),
+ }),
+ )
+
paginator = AccelleratedPaginator
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 5193166d..4581f208 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'
-import sys
-
from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
-
- # Auto-start the orchestrator when running the web server
- self._maybe_start_orchestrator()
-
- def _maybe_start_orchestrator(self):
- """Start the orchestrator if we're running a web server."""
- import os
-
- # Don't start orchestrator during migrations, shell, tests, etc.
- # Only start when running: runserver, daphne, gunicorn, uwsgi
- if not self._is_web_server():
- return
-
- # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
- if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
- return
-
- # Don't start in autoreload child process (avoid double-start)
- if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
- return
-
- try:
- from workers.orchestrator import Orchestrator
-
- if not Orchestrator.is_running():
- # Start orchestrator as daemon (won't exit on idle when started by server)
- orchestrator = Orchestrator(exit_on_idle=False)
- orchestrator.start()
- except Exception as e:
- # Don't crash the server if orchestrator fails to start
- import logging
- logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
-
- def _is_web_server(self) -> bool:
- """Check if we're running a web server command."""
- # Check for common web server indicators
- server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
- return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
diff --git a/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py b/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py
new file mode 100644
index 00000000..0c2d80d6
--- /dev/null
+++ b/archivebox/core/migrations/0025_allow_duplicate_urls_per_crawl.py
@@ -0,0 +1,22 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0024_snapshot_crawl'),
+ ]
+
+ operations = [
+ # Remove the unique constraint on url
+ migrations.AlterField(
+ model_name='snapshot',
+ name='url',
+ field=models.URLField(db_index=True, unique=False),
+ ),
+ # Add unique constraint on (url, crawl) combination
+ migrations.AddConstraint(
+ model_name='snapshot',
+ constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
+ ),
+ ]
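
Hedged sketch of what the new constraint permits, assuming two saved Crawl rows and that the remaining required Snapshot fields (created_by, timestamp, ...) are filled in by defaults or by save():

    from django.db import IntegrityError, transaction
    from core.models import Snapshot

    def demo_unique_url_per_crawl(crawl_a, crawl_b):
        # The same URL may now be snapshotted once per crawl...
        Snapshot.objects.create(url='https://example.com', crawl=crawl_a)
        Snapshot.objects.create(url='https://example.com', crawl=crawl_b)
        # ...but a duplicate within one crawl is rejected by unique_url_per_crawl.
        try:
            with transaction.atomic():
                Snapshot.objects.create(url='https://example.com', crawl=crawl_a)
        except IntegrityError:
            pass
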
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 543435aa..57369460 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
- if self._state.adding:
+ is_new = self._state.adding
+ if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
+ if is_new:
+ from archivebox.misc.logging_util import log_worker_event
+ log_worker_event(
+ worker_type='DB',
+ event='Created Tag',
+ indent_level=0,
+ metadata={
+ 'id': self.id,
+ 'name': self.name,
+ 'slug': self.slug,
+ },
+ )
+
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
- try:
- snapshot = self.get(url=url)
+ # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
+ snapshot = self.filter(url=url).order_by('-created_at').first()
+ if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
- except self.model.DoesNotExist:
+ else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
- url = models.URLField(unique=True, db_index=True)
+ url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
+ constraints = [
+ # Allow same URL in different crawls, but not duplicates within same crawl
+ models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
+ ]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
+ is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
+ if is_new:
+ from archivebox.misc.logging_util import log_worker_event
+ log_worker_event(
+ worker_type='DB',
+ event='Created Snapshot',
+ indent_level=2,
+ url=self.url,
+ metadata={
+ 'id': str(self.id),
+ 'crawl_id': str(self.crawl_id) if self.crawl_id else None,
+ 'depth': self.depth,
+ 'status': self.status,
+ },
+ )
+
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
+ def save(self, *args, **kwargs):
+ is_new = self._state.adding
+ super().save(*args, **kwargs)
+ if is_new:
+ from archivebox.misc.logging_util import log_worker_event
+ log_worker_event(
+ worker_type='DB',
+ event='Created ArchiveResult',
+ indent_level=3,
+ extractor=self.extractor,
+ metadata={
+ 'id': str(self.id),
+ 'snapshot_id': str(self.snapshot_id),
+ 'snapshot_url': str(self.snapshot.url)[:64],
+ 'status': self.status,
+ },
+ )
+
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
- extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
+ # Use plugin directory name instead of extractor name (removes numeric prefix)
+ plugin_name = hook.parent.name
+ extractor_dir = Path(self.snapshot.output_dir) / plugin_name
+
# Run the hook
start_ts = timezone.now()
result = run_hook(
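
The same creation-logging pattern is added to Tag, Snapshot, and ArchiveResult in this file, and to Seed and Crawl below. Sketched generically (the mixin is hypothetical; the log_worker_event kwargs are the ones used in this diff):

    from django.db import models

    class CreationLoggedModel(models.Model):
        class Meta:
            abstract = True

        def save(self, *args, **kwargs):
            # _state.adding is True only before the first INSERT, so the event
            # fires exactly once per row, after it has a primary key.
            is_new = self._state.adding
            super().save(*args, **kwargs)
            if is_new:
                from archivebox.misc.logging_util import log_worker_event
                log_worker_event(
                    worker_type='DB',
                    event=f'Created {type(self).__name__}',
                    indent_level=0,
                    metadata={'id': str(self.pk)},
                )
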
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index 26a0ed7f..fde35403 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
- return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-
+ return f'Snapshot[{self.snapshot.id}]'
+
def __str__(self) -> str:
return self.__repr__()
-
+
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
- if not can_start:
- print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
+ # Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
- print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
+ # Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
-
+
@started.enter
def enter_started(self):
- print(f'{self}.on_started() ↳ snapshot.run()')
+ # Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)
-
+
@sealed.enter
def enter_sealed(self):
- print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
+ # Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
- return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-
+ return f'ArchiveResult[{self.archiveresult.id}]'
+
def __str__(self) -> str:
return self.__repr__()
-
+
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
- if not can_start:
- print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
+ # Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
- print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
+ # Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
-
+
@started.enter
def enter_started(self):
- print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
-
+ # Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
-
+
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
-
+
# Save the updated result
self.archiveresult.save()
-
- # Log the result
- if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
- print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
- elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
- print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
- elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
- print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
+
+ # Suppressed: extractor result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
- print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
+ # Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
-
+
@succeeded.enter
def enter_succeeded(self):
- print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+ # Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter
def enter_failed(self):
- print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+ # Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@skipped.enter
def enter_skipped(self):
- print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
+ # Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 43110364..3f9b1794 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
- # Start orchestrator in background to process the queued crawl
- try:
- from archivebox.workers.tasks import ensure_orchestrator_running
- ensure_orchestrator_running()
- except Exception as e:
- # Orchestrator may already be running via supervisord, or fail to start
- # This is not fatal - the crawl will be processed when orchestrator runs
- print(f'[!] Failed to start orchestrator: {e}')
-
+ # Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
+ from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
+ started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
+ # Count URLs in the crawl (for when snapshots haven't been created yet)
+ urls_count = 0
+ if crawl.urls:
+ urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
+ elif crawl.seed and crawl.seed.uri:
+ # Try to get URL count from seed
+ if crawl.seed.uri.startswith('file:///'):
+ try:
+ from pathlib import Path
+ seed_file = Path(crawl.seed.uri.replace('file://', ''))
+ if seed_file.exists():
+ urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
+                except Exception:
+ pass
+ else:
+ urls_count = 1 # Single URL seed
+
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
- # Get active extractors for this snapshot
- active_extractors = [
+ # Get all extractors for this snapshot
+ # Order: started first, then queued, then completed
+ all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
- 'started': ar.start_ts.isoformat() if ar.start_ts else None,
- 'progress': 50,
}
- for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
+ for ar in snapshot_results.annotate(
+ status_order=Case(
+ When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
+ When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
+ When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
+ When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
+ default=Value(4),
+ output_field=IntegerField(),
+ )
+ ).order_by('status_order', 'extractor')
]
active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
- 'active_extractors': active_extractors,
+ 'all_extractors': all_extractors,
})
+ # Check if crawl can start (for debugging stuck crawls)
+ can_start = bool(crawl.seed and crawl.seed.uri)
+ seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
+
+ # Check if retry_at is in the future (would prevent worker from claiming)
+ retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
+ seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
+
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
+ 'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
+ 'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
+ 'can_start': can_start,
+ 'seed_uri': seed_uri,
+ 'retry_at_future': retry_at_future,
+ 'seconds_until_retry': seconds_until_retry,
})
return JsonResponse({
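
Standalone sketch of the status-priority ordering used in live_progress_view above (helper name is illustrative):

    from django.db.models import Case, When, Value, IntegerField
    from core.models import ArchiveResult

    def by_status_priority(archiveresults_qs):
        # STARTED first, then QUEUED, then finished results, alphabetical within each group.
        return archiveresults_qs.annotate(
            status_order=Case(
                When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
                When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
                When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
                When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
                default=Value(4),
                output_field=IntegerField(),
            )
        ).order_by('status_order', 'extractor')
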
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 611a80bc..e5e7f2eb 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -8,6 +8,7 @@ from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
+from django.db.models import Count, Q
from archivebox import DATA_DIR
@@ -19,13 +20,155 @@ from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule
+def render_snapshots_list(snapshots_qs, limit=20):
+ """Render a nice inline list view of snapshots with status, title, URL, and progress."""
+
+    # Annotate per-snapshot result counts first, then take the most recent `limit` rows
+    snapshots = snapshots_qs.annotate(
+        total_results=Count('archiveresult'),
+        succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
+        failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
+    ).order_by('-created_at')[:limit]
+
+ if not snapshots:
+        return mark_safe('<p>No Snapshots yet...</p>')
+
+ # Status colors matching Django admin and progress monitor
+ status_colors = {
+ 'queued': ('#6c757d', '#f8f9fa'), # gray
+ 'started': ('#856404', '#fff3cd'), # amber
+ 'sealed': ('#155724', '#d4edda'), # green
+ 'failed': ('#721c24', '#f8d7da'), # red
+ }
+
+ rows = []
+ for snapshot in snapshots:
+ status = snapshot.status or 'queued'
+ color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))
+
+ # Calculate progress
+ total = snapshot.total_results
+ done = snapshot.succeeded_results + snapshot.failed_results
+ progress_pct = int((done / total) * 100) if total > 0 else 0
+ progress_text = f'{done}/{total}' if total > 0 else '-'
+
+ # Truncate title and URL
+ title = (snapshot.title or 'Untitled')[:60]
+ if len(snapshot.title or '') > 60:
+ title += '...'
+ url_display = snapshot.url[:50]
+ if len(snapshot.url) > 50:
+ url_display += '...'
+
+ # Format date
+ date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
+
+        rows.append(f'''
+            <tr>
+                <td>
+                    <span style="color: {color}; background: {bg}; padding: 2px 8px; border-radius: 4px;">{status}</span>
+                </td>
+                <td>
+                    <a href="/archive/{snapshot.timestamp}/" target="_blank">
+                    <img src="/archive/{snapshot.timestamp}/favicon.ico" alt="" style="width: 16px; height: 16px;" onerror="this.style.display='none'">
+                    </a>
+                </td>
+                <td>
+                    {title}
+                </td>
+                <td>
+                    {url_display}
+                </td>
+                <td>
+                    <div title="{progress_text}" style="background: #e9ecef; border-radius: 4px;"><div style="background: #28a745; height: 12px; width: {progress_pct}%; border-radius: 4px;"></div></div>
+                </td>
+                <td>
+                    {date_str}
+                </td>
+            </tr>
+        ''')
+
+ total_count = snapshots_qs.count()
+ footer = ''
+ if total_count > limit:
+        footer = f'''
+            <tr>
+                <td colspan="6" style="text-align: center; color: #6c757d;">
+                    Showing {limit} of {total_count} snapshots
+                </td>
+            </tr>
+        '''
+
+    return mark_safe(f'''
+    <div style="overflow-x: auto;">
+        <table style="width: 100%; border-collapse: collapse;">
+            <thead>
+                <tr style="text-align: left; border-bottom: 2px solid #dee2e6;">
+                    <th>Status</th>
+                    <th></th>
+                    <th>Title</th>
+                    <th>URL</th>
+                    <th>Progress</th>
+                    <th>Created</th>
+                </tr>
+            </thead>
+            <tbody>
+                {''.join(rows)}
+                {footer}
+            </tbody>
+        </table>
+    </div>
+    ''')
+
+
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
- fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+
+ fieldsets = (
+ ('Source', {
+ 'fields': ('uri', 'contents'),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Info', {
+ 'fields': ('label', 'notes', 'tags_str'),
+ 'classes': ('card',),
+ }),
+ ('Settings', {
+ 'fields': ('extractor', 'config'),
+ 'classes': ('card',),
+ }),
+ ('Metadata', {
+ 'fields': ('created_by', 'created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ ('Crawls', {
+ 'fields': ('scheduled_crawls', 'crawls'),
+ 'classes': ('card',),
+ }),
+ ('Snapshots', {
+ 'fields': ('snapshots',),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
)) or mark_safe('No Crawls yet...')
def snapshots(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
- (snapshot.admin_change_url, snapshot)
- for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
- )) or mark_safe('No Snapshots yet...')
+ return render_snapshots_list(obj.snapshot_set.all())
def contents(self, obj):
- if obj.uri.startswith('file:///data/'):
- source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
+ source_file = obj.get_file_path()
+ if source_file:
contents = ""
try:
contents = source_file.read_text().strip()[:14_000]
except Exception as e:
contents = f'Error reading {source_file}: {e}'
-
+
+            return format_html('<b>{}</b>:<br/><pre>{}</pre>', source_file, contents)
-
+
+        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
- fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
+
+ fieldsets = (
+ ('URLs', {
+ 'fields': ('seed_urls_editor',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Info', {
+ 'fields': ('label', 'notes'),
+ 'classes': ('card',),
+ }),
+ ('Settings', {
+ 'fields': ('max_depth', 'config'),
+ 'classes': ('card',),
+ }),
+ ('Status', {
+ 'fields': ('status', 'retry_at'),
+ 'classes': ('card',),
+ }),
+ ('Relations', {
+ 'fields': ('seed', 'schedule', 'created_by'),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ ('Snapshots', {
+ 'fields': ('snapshots',),
+ 'classes': ('card', 'wide'),
+ }),
+ )
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same seed and settings."""
from django.utils import timezone
+ from django.shortcuts import redirect
+
+ # Validate seed has a URI (required for crawl to start)
+ if not obj.seed:
+ messages.error(request, 'Cannot recrawl: original crawl has no seed.')
+ return redirect('admin:crawls_crawl_change', obj.id)
+
+ if not obj.seed.uri:
+ messages.error(request, 'Cannot recrawl: seed has no URI.')
+ return redirect('admin:crawls_crawl_change', obj.id)
new_crawl = Crawl.objects.create(
seed=obj.seed,
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
f'It will start processing shortly.'
)
- # Redirect to the new crawl's change page
- from django.shortcuts import redirect
return redirect('admin:crawls_crawl_change', new_crawl.id)
def get_urls(self):
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except Crawl.DoesNotExist:
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
- if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
+ source_file = crawl.seed.get_file_path() if crawl.seed else None
+ if not source_file:
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
try:
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except json.JSONDecodeError:
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
- source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
-
try:
# Ensure parent directory exists
source_file.parent.mkdir(parents=True, exist_ok=True)
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return obj.snapshot_set.count()
def snapshots(self, obj):
-        return format_html_join('<br/>', '<a href="{}">{}</a>', (
- (snapshot.admin_change_url, snapshot)
- for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
- )) or mark_safe('No Snapshots yet...')
+ return render_snapshots_list(obj.snapshot_set.all())
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
seed_uri = obj.urls
# Check if it's a local file we can edit
- is_file = seed_uri.startswith('file:///data/')
+ source_file = obj.seed.get_file_path() if obj.seed else None
+ is_file = source_file is not None
contents = ""
error = None
- source_file = None
- if is_file:
- source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
+ if is_file and source_file:
try:
contents = source_file.read_text().strip()
except Exception as e:
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
- fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
+
+ fieldsets = (
+ ('Schedule Info', {
+ 'fields': ('label', 'notes'),
+ 'classes': ('card',),
+ }),
+ ('Configuration', {
+ 'fields': ('schedule', 'template'),
+ 'classes': ('card',),
+ }),
+ ('Metadata', {
+ 'fields': ('created_by', 'created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ ('Crawls', {
+ 'fields': ('crawls',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Snapshots', {
+ 'fields': ('snapshots',),
+ 'classes': ('card', 'wide'),
+ }),
+ )
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
def snapshots(self, obj):
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
- (snapshot.admin_change_url, snapshot)
- for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
- )) or mark_safe('No Snapshots yet...')
+ return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
def register_admin(admin_site):
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index fadd693d..4bd00328 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def __str__(self):
return f'[{self.id}] {self.uri[:64]}'
+ def save(self, *args, **kwargs):
+ is_new = self._state.adding
+ super().save(*args, **kwargs)
+ if is_new:
+ from archivebox.misc.logging_util import log_worker_event
+ log_worker_event(
+ worker_type='DB',
+ event='Created Seed',
+ indent_level=0,
+ metadata={
+ 'id': str(self.id),
+ 'uri': str(self.uri)[:64],
+ 'extractor': self.extractor,
+ 'label': self.label or None,
+ },
+ )
+
@classmethod
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
- source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
+ # Use absolute path for file:// URLs so extractors can find the files
+ source_path = str(source_file.resolve())
seed, _ = cls.objects.get_or_create(
label=label or source_file.name, uri=f'file://{source_path}',
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def api_url(self) -> str:
return reverse_lazy('api-1:get_seed', args=[self.id])
+ def get_file_path(self) -> Path | None:
+ """
+ Get the filesystem path for file:// URIs.
+ Handles both old format (file:///data/...) and new format (file:///absolute/path).
+ Returns None if URI is not a file:// URI.
+ """
+ if not self.uri.startswith('file://'):
+ return None
+
+ # Remove file:// prefix
+ path_str = self.uri.replace('file://', '', 1)
+
+ # Handle old format: file:///data/... -> DATA_DIR/...
+ if path_str.startswith('/data/'):
+ return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
+
+ # Handle new format: file:///absolute/path
+ return Path(path_str)
+
@property
def snapshot_set(self) -> QuerySet['Snapshot']:
from core.models import Snapshot
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def __str__(self):
return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
+ def save(self, *args, **kwargs):
+ is_new = self._state.adding
+ super().save(*args, **kwargs)
+ if is_new:
+ from archivebox.misc.logging_util import log_worker_event
+ log_worker_event(
+ worker_type='DB',
+ event='Created Crawl',
+ indent_level=1,
+ metadata={
+ 'id': str(self.id),
+ 'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
+ 'max_depth': self.max_depth,
+ 'status': self.status,
+ },
+ )
+
@classmethod
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
crawl, _ = cls.objects.get_or_create(
diff --git a/archivebox/crawls/statemachines.py b/archivebox/crawls/statemachines.py
index 6b68e15b..a71cd010 100644
--- a/archivebox/crawls/statemachines.py
+++ b/archivebox/crawls/statemachines.py
@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
super().__init__(crawl, *args, **kwargs)
def __repr__(self) -> str:
- return f'[grey53]Crawl\\[{self.crawl.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
-
+ return f'Crawl[{self.crawl.id}]'
+
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
- return bool(self.crawl.seed and self.crawl.seed.uri)
+ if not self.crawl.seed:
+ print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
+ return False
+ if not self.crawl.seed.uri:
+ print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
+ return False
+ return True
def is_finished(self) -> bool:
from core.models import Snapshot, ArchiveResult
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
- print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
+ # Suppressed: state transition logs
# lock the crawl object while we create snapshots
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.QUEUED,
)
- # Run the crawl - creates root snapshot and processes queued URLs
- self.crawl.run()
+ try:
+ # Run on_Crawl hooks to validate/install dependencies
+ self._run_crawl_hooks()
- # only update status to STARTED once snapshots are created
- self.crawl.update_for_workers(
- retry_at=timezone.now() + timedelta(seconds=5),
- status=Crawl.StatusChoices.STARTED,
+ # Run the crawl - creates root snapshot and processes queued URLs
+ self.crawl.run()
+
+ # only update status to STARTED once snapshots are created
+ self.crawl.update_for_workers(
+ retry_at=timezone.now() + timedelta(seconds=5),
+ status=Crawl.StatusChoices.STARTED,
+ )
+ except Exception as e:
+ print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
+ import traceback
+ traceback.print_exc()
+ # Re-raise so the worker knows it failed
+ raise
+
+ def _run_crawl_hooks(self):
+ """Run on_Crawl hooks to validate/install dependencies."""
+ from pathlib import Path
+ from archivebox.hooks import run_hooks, discover_hooks
+ from archivebox.config import CONSTANTS
+
+ # Discover and run all on_Crawl hooks
+ hooks = discover_hooks('Crawl')
+ if not hooks:
+ return
+
+ # Create a temporary output directory for hook results
+ output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Run all on_Crawl hooks
+ results = run_hooks(
+ event_name='Crawl',
+ output_dir=output_dir,
+ timeout=60,
+ config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
+ crawl_id=str(self.crawl.id),
+ seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
)
- @sealed.enter
+ # Process hook results - parse JSONL output and create DB objects
+ self._process_hook_results(results)
+
+ def _process_hook_results(self, results: list):
+ """Process JSONL output from hooks to create InstalledBinary and update Machine config."""
+ import json
+ from machine.models import Machine, InstalledBinary
+
+ machine = Machine.current()
+
+ for result in results:
+ if result['returncode'] != 0:
+ # Hook failed - might indicate missing dependency
+ continue
+
+ # Parse JSONL output
+ for line in result['stdout'].strip().split('\n'):
+ if not line.strip():
+ continue
+
+ try:
+ obj = json.loads(line)
+ obj_type = obj.get('type')
+
+ if obj_type == 'InstalledBinary':
+ # Create or update InstalledBinary record
+ # Skip if essential fields are missing
+ if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
+ continue
+
+ InstalledBinary.objects.update_or_create(
+ machine=machine,
+ name=obj['name'],
+ defaults={
+ 'abspath': obj['abspath'],
+ 'version': obj['version'],
+ 'sha256': obj.get('sha256') or '',
+ 'binprovider': obj.get('binprovider') or 'env',
+ }
+ )
+
+ elif obj_type == 'Machine':
+ # Update Machine config
+ method = obj.get('_method', 'update')
+ if method == 'update':
+ key = obj.get('key', '')
+ value = obj.get('value')
+ if key.startswith('config/'):
+ config_key = key[7:] # Remove 'config/' prefix
+ machine.config[config_key] = value
+ machine.save(update_fields=['config'])
+
+ elif obj_type == 'Dependency':
+ # Dependency request - could trigger installation
+ # For now just log it (installation hooks would be separate)
+ print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')
+
+ except json.JSONDecodeError:
+ # Not JSON, skip
+ continue
+
+ @sealed.enter
def enter_sealed(self):
- print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
+ # Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 4c2bdd09..4b06324a 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -245,6 +245,14 @@ def run_hook(
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
+ # Pass SEARCH_BACKEND_ENGINE from new-style config
+ try:
+ from archivebox.config.configset import get_config
+ search_config = get_config()
+ env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
+ except Exception:
+ env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
+
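+ # Hooks consume this via their environment, e.g. the ripgrep validation hook below
+ # checks os.environ.get('SEARCH_BACKEND_ENGINE', '').lower() == 'ripgrep' before doing any work.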
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)
diff --git a/archivebox/logs/errors.log b/archivebox/logs/errors.log
new file mode 100644
index 00000000..715cf9d3
--- /dev/null
+++ b/archivebox/logs/errors.log
@@ -0,0 +1,2 @@
+
+> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False
diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py
index b1796025..adb6dd19 100644
--- a/archivebox/machine/admin.py
+++ b/archivebox/machine/admin.py
@@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
- fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
+
+ fieldsets = (
+ ('Identity', {
+ 'fields': ('hostname', 'guid', 'ips'),
+ 'classes': ('card',),
+ }),
+ ('Hardware', {
+ 'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'),
+ 'classes': ('card',),
+ }),
+ ('Operating System', {
+ 'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'),
+ 'classes': ('card',),
+ }),
+ ('Statistics', {
+ 'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'),
+ 'classes': ('card',),
+ }),
+ ('Configuration', {
+ 'fields': ('config',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
@@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
- fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
+
+ fieldsets = (
+ ('Machine', {
+ 'fields': ('machine',),
+ 'classes': ('card',),
+ }),
+ ('Network', {
+ 'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
+ 'classes': ('card',),
+ }),
+ ('Location', {
+ 'fields': ('hostname', 'isp', 'city', 'region', 'country'),
+ 'classes': ('card',),
+ }),
+ ('Usage', {
+ 'fields': ('num_uses_succeeded', 'num_uses_failed'),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('isp', 'country', 'region')
ordering = ['-created_at']
@@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'bin_name', 'bin_providers')
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
- fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
+
+ fieldsets = (
+ ('Binary', {
+ 'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
+ 'classes': ('card',),
+ }),
+ ('Commands', {
+ 'fields': ('custom_cmds',),
+ 'classes': ('card',),
+ }),
+ ('Configuration', {
+ 'fields': ('config',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Timestamps', {
+ 'fields': ('id', 'created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('bin_providers', 'created_at')
ordering = ['-created_at']
@@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
readonly_fields = ('created_at', 'modified_at')
- fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
+
+ fieldsets = (
+ ('Binary Info', {
+ 'fields': ('name', 'dependency', 'binprovider'),
+ 'classes': ('card',),
+ }),
+ ('Location', {
+ 'fields': ('machine', 'abspath'),
+ 'classes': ('card',),
+ }),
+ ('Version', {
+ 'fields': ('version', 'sha256'),
+ 'classes': ('card',),
+ }),
+ ('Usage', {
+ 'fields': ('num_uses_succeeded', 'num_uses_failed'),
+ 'classes': ('card',),
+ }),
+ ('Timestamps', {
+ 'fields': ('created_at', 'modified_at'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
ordering = ['-created_at']
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 469b705b..766eed98 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -544,16 +544,21 @@ def log_worker_event(
# Build worker identifier
worker_parts = [worker_type]
- if pid:
+ # Don't add pid/worker_id for DB operations (they happen in whatever process is running)
+ if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
- if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
+ if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
- if url and worker_type == 'SnapshotWorker':
+ if url and worker_type in ('SnapshotWorker', 'DB'):
worker_parts.append(f'url={truncate_url(url)}')
- if extractor and worker_type == 'ArchiveResultWorker':
+ if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
worker_parts.append(f'extractor={extractor}')
- worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
+ # Format worker label - only add brackets if there are additional identifiers
+ if len(worker_parts) > 1:
+ worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
+ else:
+ worker_label = worker_parts[0]
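+ # e.g. 'DB' (no extra identifiers) vs 'SnapshotWorker[pid=1234, url=https://example.com]' (illustrative values)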
# Build metadata string
metadata_str = ''
@@ -579,12 +584,14 @@ def log_worker_event(
meta_parts.append(f'{k}: {len(v)}')
else:
meta_parts.append(f'{k}: {v}')
- metadata_str = ' {' + ', '.join(meta_parts) + '}'
+ metadata_str = ' | '.join(meta_parts)
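+ # e.g. 'id: 01ABC... | uri: https://example.com' instead of the old ' {id: 01ABC..., uri: https://example.com}' (illustrative values)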
# Determine color based on event
color = 'white'
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
color = 'green'
+ elif event.startswith('Created'):
+ color = 'cyan' # DB creation events
elif event in ('Processing...', 'PROCESSING'):
color = 'blue'
elif event in ('Completed', 'COMPLETED', 'All work complete'):
@@ -606,8 +613,9 @@ def log_worker_event(
text.append(indent) # Indentation
# Append worker label and event with color
text.append(f'{worker_label} {event}{error_str}', style=color)
- # Append metadata without color
- text.append(metadata_str)
+ # Append metadata without color (add separator if metadata exists)
+ if metadata_str:
+ text.append(f' | {metadata_str}')
CONSOLE.print(text)
diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
index 88f90fb4..c509be9a 100755
--- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
+++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';
-const OUTPUT_DIR = 'accessibility';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract accessibility info
async function extractAccessibility(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
index f9eca9bf..1fbd0a6b 100644
--- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
+++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
@@ -24,7 +24,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'
-OUTPUT_DIR = 'archive_org'
+OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'
diff --git a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
index 2baedcad..fae91ffb 100644
--- a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
+++ b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'
def get_env(name: str, default: str = '') -> str:
diff --git a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
index c883a74f..b34c8c96 100644
--- a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
+++ b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
index ba72f2a2..fc90aa03 100755
--- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
+++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
-const OUTPUT_DIR = 'consolelog';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
async function captureConsoleLogs(url) {
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Clear existing file
diff --git a/archivebox/plugins/dom/on_Snapshot__36_dom.js b/archivebox/plugins/dom/on_Snapshot__36_dom.js
index b3b65614..6020ed55 100644
--- a/archivebox/plugins/dom/on_Snapshot__36_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__36_dom.js
@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
-const OUTPUT_DIR = 'dom';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -114,10 +114,7 @@ async function dumpDom(url) {
const { width, height } = parseResolution(resolution);
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
index 61280af2..78c9e4b3 100644
--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
-OUTPUT_DIR = 'favicon'
+OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'
diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__12_git.py
index 4d2db822..16e0c43e 100644
--- a/archivebox/plugins/git/on_Snapshot__12_git.py
+++ b/archivebox/plugins/git/on_Snapshot__12_git.py
@@ -26,7 +26,7 @@ import rich_click as click
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'repo'
+OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
diff --git a/archivebox/plugins/headers/on_Snapshot__33_headers.js b/archivebox/plugins/headers/on_Snapshot__33_headers.js
index 79ba3eed..5ead49f5 100644
--- a/archivebox/plugins/headers/on_Snapshot__33_headers.js
+++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js
@@ -22,9 +22,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
-const OUTPUT_DIR = 'headers';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
// Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
}
async function extractHeaders(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first
diff --git a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
index 43a53b30..21293014 100644
--- a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
+++ b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
@@ -28,7 +28,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
-OUTPUT_DIR = 'htmltotext'
+OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
if not text or len(text) < 10:
return False, None, 'No meaningful text extracted from HTML'
- # Create output directory and write output
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
output_path.write_text(text, encoding='utf-8')
diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__51_media.py
index 552f5258..1677fc2c 100644
--- a/archivebox/plugins/media/on_Snapshot__51_media.py
+++ b/archivebox/plugins/media/on_Snapshot__51_media.py
@@ -39,7 +39,7 @@ import rich_click as click
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
-OUTPUT_DIR = 'media'
+OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
- # Create output directory
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
# Build command (later options take precedence)
cmd = [
diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
index e9b5f63a..efd3ed6b 100644
--- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
+++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
@@ -27,7 +27,7 @@ import rich_click as click
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'mercury'
+OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
timeout = get_env_int('TIMEOUT', 60)
- # Create output directory
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
try:
# Get text version
diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
index 9cff5e33..72708e95 100755
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
-const OUTPUT_DIR = 'parse_dom_outlinks';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {
// Extract outlinks
async function extractOutlinks(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
index c6967b46..e4787be7 100644
--- a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
+++ b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
-const OUTPUT_DIR = 'pdf';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.pdf';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -113,10 +113,7 @@ async function printToPdf(url) {
const { width, height } = parseResolution(resolution);
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__52_readability.py
index 165bc71c..a161e03f 100644
--- a/archivebox/plugins/readability/on_Snapshot__52_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py
@@ -29,7 +29,7 @@ import rich_click as click
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'readability'
+OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
if not html_source:
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
- # Create output directory
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
try:
# Run readability-extractor (outputs JSON by default)
diff --git a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
index 3aba0581..aaa43232 100755
--- a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
+++ b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';
-const OUTPUT_DIR = 'redirects';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Track redirect chain
async function trackRedirects(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.js b/archivebox/plugins/responses/on_Snapshot__24_responses.js
index f5094dea..c69743b4 100755
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.js
@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
-const OUTPUT_DIR = 'responses';
-const CHROME_SESSION_DIR = 'chrome_session';
+const OUTPUT_DIR = '.';
+const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
- // Create output directories
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
+ // Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
fs.mkdirSync(allDir, { recursive: true });
diff --git a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
index 35465ef1..db9b6467 100644
--- a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
+++ b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';
-const OUTPUT_DIR = 'screenshot';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'screenshot.png';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
const { width, height } = parseResolution(resolution);
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
new file mode 100755
index 00000000..714b36df
--- /dev/null
+++ b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Validation hook for ripgrep binary.
+
+Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
+Outputs JSONL for InstalledBinary and Machine config updates.
+"""
+
+import os
+import sys
+import json
+import shutil
+import hashlib
+import subprocess
+from pathlib import Path
+
+
+def get_binary_version(abspath: str) -> str | None:
+ """Get version string from ripgrep binary."""
+ try:
+ result = subprocess.run(
+ [abspath, '--version'],
+ capture_output=True,
+ text=True,
+ timeout=5,
+ )
+ if result.returncode == 0 and result.stdout:
+ # ripgrep version string: "ripgrep 14.1.0"
+ first_line = result.stdout.strip().split('\n')[0]
+ parts = first_line.split()
+ for i, part in enumerate(parts):
+ if part.lower() == 'ripgrep' and i + 1 < len(parts):
+ return parts[i + 1]
+ # Try to find version number pattern
+ for part in parts:
+ if part[0].isdigit() and '.' in part:
+ return part
+ return first_line[:32]
+ except Exception:
+ pass
+ return None
+
+
+def get_binary_hash(abspath: str) -> str | None:
+ """Get SHA256 hash of binary."""
+ try:
+ with open(abspath, 'rb') as f:
+ return hashlib.sha256(f.read()).hexdigest()
+ except Exception:
+ return None
+
+
+def find_ripgrep() -> dict | None:
+ """Find ripgrep binary using shutil.which or env var."""
+ # Check env var first - if it looks like a path (contains '/') and points to an existing file, use it
+ ripgrep_env = os.environ.get('RIPGREP_BINARY', '')
+ if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
+ abspath = ripgrep_env
+ else:
+ # Otherwise try shutil.which with the env var as the binary name
+ abspath = shutil.which(ripgrep_env) if ripgrep_env else None
+ if not abspath:
+ abspath = shutil.which('rg')
+
+ if abspath and Path(abspath).is_file():
+ return {
+ 'name': 'rg',
+ 'abspath': abspath,
+ 'version': get_binary_version(abspath),
+ 'sha256': get_binary_hash(abspath),
+ 'binprovider': 'env',
+ }
+
+ return None
+
+
+def main():
+ """Validate ripgrep binary and output JSONL."""
+
+ # Check if ripgrep search backend is enabled
+ search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
+
+ if search_backend != 'ripgrep':
+ # No-op: ripgrep is not the active search backend
+ sys.exit(0)
+
+ result = find_ripgrep()
+
+ if result and result.get('abspath'):
+ # Output InstalledBinary
+ print(json.dumps({
+ 'type': 'InstalledBinary',
+ 'name': result['name'],
+ 'abspath': result['abspath'],
+ 'version': result['version'],
+ 'sha256': result['sha256'],
+ 'binprovider': result['binprovider'],
+ }))
+
+ # Output Machine config update
+ print(json.dumps({
+ 'type': 'Machine',
+ '_method': 'update',
+ 'key': 'config/RIPGREP_BINARY',
+ 'value': result['abspath'],
+ }))
+
+ if result['version']:
+ print(json.dumps({
+ 'type': 'Machine',
+ '_method': 'update',
+ 'key': 'config/RIPGREP_VERSION',
+ 'value': result['version'],
+ }))
+
+ sys.exit(0)
+ else:
+ # Output Dependency request
+ print(json.dumps({
+ 'type': 'Dependency',
+ 'bin_name': 'rg',
+ 'bin_providers': 'apt,brew,cargo,env',
+ }))
+
+ # Exit non-zero to indicate binary not found
+ print(f"ripgrep binary not found", file=sys.stderr)
+ sys.exit(1)
+
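+# Example manual invocation (environment is normally provided by archivebox.hooks.run_hook;
+# values illustrative): SEARCH_BACKEND_ENGINE=ripgrep RIPGREP_BINARY=rg ./on_Crawl__00_validate_ripgrep.py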
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/plugins/search_backend_ripgrep/tests/__init__.py b/archivebox/plugins/search_backend_ripgrep/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
new file mode 100644
index 00000000..5e36f5bf
--- /dev/null
+++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+Tests for ripgrep binary detection and archivebox install functionality.
+
+Guards against regressions in:
+1. Machine.config overrides not being used in version command
+2. Ripgrep hook not resolving binary names via shutil.which()
+3. SEARCH_BACKEND_ENGINE not being passed to hook environment
+"""
+
+import os
+import sys
+import json
+import shutil
+import tempfile
+import subprocess
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+
+def test_ripgrep_hook_detects_binary_from_path():
+ """Test that ripgrep hook finds binary using shutil.which() when env var is just a name."""
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+ # Skip if rg is not installed
+ if not shutil.which('rg'):
+ pytest.skip("ripgrep (rg) not installed")
+
+ # Set SEARCH_BACKEND_ENGINE to enable the hook
+ env = os.environ.copy()
+ env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
+ env['RIPGREP_BINARY'] = 'rg' # Just the name, not the full path (this was the bug)
+
+ result = subprocess.run(
+ [sys.executable, str(hook_path)],
+ capture_output=True,
+ text=True,
+ env=env,
+ timeout=10,
+ )
+
+ assert result.returncode == 0, f"Hook failed: {result.stderr}"
+
+ # Parse JSONL output
+ lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
+ assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"
+
+ installed_binary = json.loads(lines[0])
+ assert installed_binary['type'] == 'InstalledBinary'
+ assert installed_binary['name'] == 'rg'
+ assert '/' in installed_binary['abspath'], "Expected full path, not just binary name"
+ assert Path(installed_binary['abspath']).is_file(), "Binary path should exist"
+ assert installed_binary['version'], "Version should be detected"
+
+ machine_config = json.loads(lines[1])
+ assert machine_config['type'] == 'Machine'
+ assert machine_config['key'] == 'config/RIPGREP_BINARY'
+ assert '/' in machine_config['value'], "Machine config should store full path"
+
+
+def test_ripgrep_hook_skips_when_backend_not_ripgrep():
+ """Test that ripgrep hook exits silently when search backend is not ripgrep."""
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+ env = os.environ.copy()
+ env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend
+
+ result = subprocess.run(
+ [sys.executable, str(hook_path)],
+ capture_output=True,
+ text=True,
+ env=env,
+ timeout=10,
+ )
+
+ assert result.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
+ assert result.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
+
+
+def test_ripgrep_hook_handles_absolute_path():
+ """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+ rg_path = shutil.which('rg')
+ if not rg_path:
+ pytest.skip("ripgrep (rg) not installed")
+
+ env = os.environ.copy()
+ env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
+ env['RIPGREP_BINARY'] = rg_path # Full absolute path
+
+ result = subprocess.run(
+ [sys.executable, str(hook_path)],
+ capture_output=True,
+ text=True,
+ env=env,
+ timeout=10,
+ )
+
+ assert result.returncode == 0, f"Hook failed: {result.stderr}"
+ assert result.stdout.strip(), "Hook should produce output"
+
+ installed_binary = json.loads(result.stdout.strip().split('\n')[0])
+ assert installed_binary['abspath'] == rg_path
+
+
+@pytest.mark.django_db
+def test_machine_config_overrides_base_config():
+ """
+ Test that Machine.config overrides take precedence over base config.
+
+ Guards against regression where archivebox version was showing binaries
+ as "not installed" even though they were detected and stored in Machine.config.
+ """
+ from machine.models import Machine, InstalledBinary
+
+ machine = Machine.current()
+
+ # Simulate a hook detecting chrome and storing it with a different path than base config
+ detected_chrome_path = '/custom/path/to/chrome'
+ machine.config['CHROME_BINARY'] = detected_chrome_path
+ machine.config['CHROME_VERSION'] = '143.0.7499.170'
+ machine.save()
+
+ # Create InstalledBinary record
+ InstalledBinary.objects.create(
+ machine=machine,
+ name='chrome',
+ abspath=detected_chrome_path,
+ version='143.0.7499.170',
+ binprovider='env',
+ )
+
+ # Verify Machine.config takes precedence
+ from archivebox.config.configset import get_config
+ config = get_config()
+
+ # Machine.config should override the base config value
+ assert machine.config.get('CHROME_BINARY') == detected_chrome_path
+
+ # The version command should use Machine.config, not base config
+ # (Base config might have 'chromium' while Machine.config has the full path)
+ bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')
+ assert bin_value == detected_chrome_path, \
+ "Machine.config override should take precedence over base config"
+
+
+@pytest.mark.django_db
+def test_search_backend_engine_passed_to_hooks():
+ """
+ Test that SEARCH_BACKEND_ENGINE is passed to hook environment.
+
+ Guards against regression where hooks couldn't determine which search backend was active.
+ """
+ from pathlib import Path
+ from archivebox.hooks import build_hook_environment
+ from archivebox.config.configset import get_config
+
+ config = get_config()
+ search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')
+
+ env = build_hook_environment(overrides=None)
+
+ assert 'SEARCH_BACKEND_ENGINE' in env, \
+ "SEARCH_BACKEND_ENGINE must be in hook environment"
+ assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
+ f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
+
+
+@pytest.mark.django_db
+def test_install_creates_installedbinary_records():
+ """
+ Test that archivebox install creates InstalledBinary records for detected binaries.
+
+ This is an integration test that verifies the full install flow.
+ """
+ from machine.models import Machine, InstalledBinary
+ from crawls.models import Seed, Crawl
+ from crawls.statemachines import CrawlMachine
+ from archivebox.base_models.models import get_or_create_system_user_pk
+
+ machine = Machine.current()
+ initial_binary_count = InstalledBinary.objects.filter(machine=machine).count()
+
+ # Create an install crawl (like archivebox install does)
+ created_by_id = get_or_create_system_user_pk()
+ seed, _ = Seed.objects.get_or_create(
+ uri='archivebox://test-install',
+ label='Test dependency detection',
+ created_by_id=created_by_id,
+ defaults={'extractor': 'auto'},
+ )
+
+ crawl = Crawl.objects.create(
+ seed=seed,
+ max_depth=0,
+ created_by_id=created_by_id,
+ status='queued',
+ )
+
+ # Run the crawl state machine (this triggers hooks)
+ sm = CrawlMachine(crawl)
+ sm.send('tick') # queued -> started (runs hooks)
+
+ # Verify InstalledBinary records were created
+ final_binary_count = InstalledBinary.objects.filter(machine=machine).count()
+ assert final_binary_count > initial_binary_count, \
+ "archivebox install should create InstalledBinary records"
+
+ # Verify at least some common binaries were detected
+ common_binaries = ['git', 'wget', 'node']
+ detected = []
+ for bin_name in common_binaries:
+ if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists():
+ detected.append(bin_name)
+
+ assert detected, f"At least one of {common_binaries} should be detected"
+
+ # Verify detected binaries have valid paths and versions
+ for binary in InstalledBinary.objects.filter(machine=machine):
+ if binary.abspath: # Only check non-empty paths
+ assert '/' in binary.abspath, \
+ f"{binary.name} should have full path, not just name: {binary.abspath}"
+ # Version might be empty for some binaries, that's ok
+
+
+@pytest.mark.django_db
+def test_ripgrep_only_detected_when_backend_enabled():
+ """
+ Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.
+
+ Guards against ripgrep being installed/detected when not needed.
+ """
+ from machine.models import Machine, InstalledBinary
+ from crawls.models import Seed, Crawl
+ from crawls.statemachines import CrawlMachine
+ from archivebox.base_models.models import get_or_create_system_user_pk
+ from django.conf import settings
+
+ if not shutil.which('rg'):
+ pytest.skip("ripgrep (rg) not installed")
+
+ machine = Machine.current()
+
+ # Clear any existing ripgrep records
+ InstalledBinary.objects.filter(machine=machine, name='rg').delete()
+
+ # Test 1: With ripgrep backend - should be detected
+ with patch('archivebox.config.configset.get_config') as mock_config:
+ mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
+
+ created_by_id = get_or_create_system_user_pk()
+ seed = Seed.objects.create(
+ uri='archivebox://test-rg-enabled',
+ label='Test ripgrep detection enabled',
+ created_by_id=created_by_id,
+ extractor='auto',
+ )
+
+ crawl = Crawl.objects.create(
+ seed=seed,
+ max_depth=0,
+ created_by_id=created_by_id,
+ status='queued',
+ )
+
+ sm = CrawlMachine(crawl)
+ sm.send('tick')
+
+ # Ripgrep should be detected
+ rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
+ assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"
+
+ # Clear records again
+ InstalledBinary.objects.filter(machine=machine, name='rg').delete()
+
+ # Test 2: With different backend - should NOT be detected
+ with patch('archivebox.config.configset.get_config') as mock_config:
+ mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
+
+ seed2 = Seed.objects.create(
+ uri='archivebox://test-rg-disabled',
+ label='Test ripgrep detection disabled',
+ created_by_id=created_by_id,
+ extractor='auto',
+ )
+
+ crawl2 = Crawl.objects.create(
+ seed=seed2,
+ max_depth=0,
+ created_by_id=created_by_id,
+ status='queued',
+ )
+
+ sm2 = CrawlMachine(crawl2)
+ sm2.send('tick')
+
+ # Ripgrep should NOT be detected
+ rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
+ assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
+
+
+if __name__ == '__main__':
+ pytest.main([__file__, '-v'])
diff --git a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
index a5d74236..fc496e74 100644
--- a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
+++ b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
@@ -29,7 +29,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'
# Text file patterns to index
INDEXABLE_FILES = [
diff --git a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
index 36445ded..9f5f7311 100644
--- a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
+++ b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [
diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js
index 3effeff3..b9efbd07 100755
--- a/archivebox/plugins/seo/on_Snapshot__38_seo.js
+++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
-const OUTPUT_DIR = 'seo';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SEO metadata
async function extractSeo(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
diff --git a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js b/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
index 6d9a6710..81d23435 100755
--- a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
+++ b/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
-const OUTPUT_DIR = 'singlefile';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
/**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
.filter(fn => fn.endsWith('.html'))
);
- // Ensure output directory exists
- await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+ // Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
return null;
}
- // Ensure output directory exists
- await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+ // Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command
diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
index 1dfcfe23..2fa60327 100644
--- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
@@ -41,7 +41,7 @@ import rich_click as click
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'singlefile'
+OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
return ''
-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'
def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if extra_args:
cmd.extend(extra_args.split())
- # Create output directory
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
sys.exit(1)
version = get_version(binary)
- cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
+ cmd_str = f'{binary} {url} {OUTPUT_FILE}'
# Run extraction
success, output, error = save_singlefile(url, binary)
diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
index 78e7592e..2ce4cd65 100755
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';
-const OUTPUT_DIR = 'ssl';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SSL details
async function extractSsl(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs
diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
index 237f2d82..62aff11d 100644
--- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
+++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
@@ -31,8 +31,8 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
-OUTPUT_DIR = 'staticfile'
-CHROME_SESSION_DIR = 'chrome_session'
+OUTPUT_DIR = '.'
+CHROME_SESSION_DIR = '../chrome_session'
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
if content_length and int(content_length) > max_size:
return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
- # Create output directory
+ # Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
- output_dir.mkdir(exist_ok=True)
# Determine filename
filename = get_filename_from_url(url)
diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__32_title.js
index afc60fb8..eb760444 100644
--- a/archivebox/plugins/title/on_Snapshot__32_title.js
+++ b/archivebox/plugins/title/on_Snapshot__32_title.js
@@ -21,9 +21,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'title';
-const OUTPUT_DIR = 'title';
+const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
}
async function extractTitle(url) {
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first
diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__50_wget.py
index 4b409d8c..265d43c2 100644
--- a/archivebox/plugins/wget/on_Snapshot__50_wget.py
+++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py
@@ -43,7 +43,7 @@ import rich_click as click
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'wget'
+OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index 8c580cc5..8d3f1e90 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -30,6 +30,1031 @@
color: white;
cursor: pointer;
}
+
+ /* ============================================
+ Modern card-based admin UI (shadcn-inspired)
+ ============================================ */
+
+ /* Base font improvements */
+ body, html {
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ -webkit-font-smoothing: antialiased;
+ -moz-osx-font-smoothing: grayscale;
+ font-size: 15px;
+ line-height: 1.6;
+ color: #0f172a;
+ background: #f8fafc;
+ }
+
+ #container {
+ background: #f8fafc;
+ }
+
+ #content {
+ padding: 24px;
+ }
+
+ /* Main form container - flexbox grid */
+ #content-main form > div,
+ #content form > div {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 20px;
+ align-items: stretch;
+ }
+
+ /* Each fieldset becomes a card */
+ #content-main form fieldset,
+ #content form fieldset,
+ #content-main form .module:not(.inline-group),
+ #content form .module:not(.inline-group) {
+ background: #fff !important;
+ border: 1px solid #e2e8f0 !important;
+ border-top: 1px solid #e2e8f0 !important;
+ border-left: 1px solid #e2e8f0 !important;
+ border-right: 1px solid #e2e8f0 !important;
+ border-bottom: 1px solid #e2e8f0 !important;
+ border-radius: 12px !important;
+ padding: 0 !important;
+ margin: 0 !important;
+ box-shadow: 0 1px 3px rgba(0,0,0,0.04), 0 1px 2px rgba(0,0,0,0.06);
+ flex: 1 1 340px;
+ min-width: 320px;
+ max-width: calc(33.33% - 14px);
+ box-sizing: border-box;
+ display: flex;
+ flex-direction: column;
+ transition: box-shadow 0.2s ease, border-color 0.2s ease;
+ overflow: hidden;
+ }
+
+ /* Wide fieldsets MUST override card max-width - placed after card rules for specificity */
+ #content-main form fieldset.wide,
+ #content form fieldset.wide,
+ #content-main form fieldset:has(.field-archiveresults_list),
+ #content form fieldset:has(.field-archiveresults_list),
+ #content-main form fieldset:has(.field-snapshots),
+ #content form fieldset:has(.field-snapshots) {
+ flex: 1 1 100% !important;
+ max-width: 100% !important;
+ min-width: 100% !important;
+ width: 100% !important;
+ flex-basis: 100% !important;
+ }
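+ /* The 'card' / 'wide' classes above are what the ModelAdmin fieldsets in this diff reference,
+    e.g. ('Snapshots', {'fields': ('snapshots',), 'classes': ('card', 'wide')}). */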
+
+ /* Inline groups should NOT have card constraints */
+ #content-main form .inline-group,
+ #content form .inline-group,
+ .inline-group fieldset,
+ .inline-group .module {
+ flex: 1 1 100% !important;
+ max-width: 100% !important;
+ min-width: 100% !important;
+ width: 100% !important;
+ }
+
+ #content-main form fieldset:hover,
+ #content form fieldset:hover {
+ box-shadow: 0 4px 6px rgba(0,0,0,0.05), 0 2px 4px rgba(0,0,0,0.06);
+ border-color: #cbd5e1;
+ }
+
+ /* Archive results list content should take full width */
+ .field-archiveresults_list,
+ .field-archiveresults_list .readonly,
+ .field-snapshots,
+ .field-snapshots .readonly {
+ width: 100% !important;
+ max-width: 100% !important;
+ background: transparent !important;
+ border: none !important;
+ padding: 0 !important;
+ }
+
+ /* Card headers - no borders, just background */
+ #content-main form fieldset h2,
+ #content form fieldset h2,
+ #content-main form .module h2,
+ #content form .module h2 {
+ margin: 0 !important;
+ padding: 8px 16px !important;
+ background: #f1f5f9 !important;
+ color: #334155 !important;
+ font-size: 12px !important;
+ font-weight: 600 !important;
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif !important;
+ border: none !important;
+ border-top: none !important;
+ border-left: none !important;
+ border-right: none !important;
+ border-bottom: none !important;
+ border-radius: 0 !important;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ flex-shrink: 0;
+ -webkit-font-smoothing: antialiased;
+ box-shadow: none !important;
+ outline: none !important;
+ }
+
+ /* Collapse toggle styling */
+ #content-main form fieldset h2 a.collapse-toggle,
+ #content form fieldset h2 a.collapse-toggle {
+ color: #64748b;
+ }
+
+ /* Card content area */
+ #content-main form fieldset > div,
+ #content form fieldset > div {
+ padding: 20px;
+ flex: 1;
+ overflow-x: hidden;
+ overflow-y: visible;
+ min-width: 0;
+ }
+
+ /* Form rows inside cards */
+ #content-main form fieldset .form-row,
+ #content form fieldset .form-row {
+ padding: 8px 0;
+ border-bottom: 1px solid #f1f5f9;
+ min-width: 0;
+ min-height: auto;
+ }
+
+ #content-main form fieldset .form-row:first-child,
+ #content form fieldset .form-row:first-child {
+ padding-top: 0;
+ }
+
+ #content-main form fieldset .form-row:last-child,
+ #content form fieldset .form-row:last-child {
+ border-bottom: none;
+ padding-bottom: 0;
+ }
+
+  /* Flatten nested fieldsets and flex-containers inside cards (no borders, no card styling) */
+ #content-main form fieldset fieldset,
+ #content form fieldset fieldset,
+ #content-main form fieldset .flex-container,
+ #content form fieldset .flex-container,
+ #content-main form .module fieldset,
+ #content form .module fieldset {
+ background: transparent !important;
+ border: none !important;
+ border-radius: 0 !important;
+ box-shadow: none !important;
+ padding: 0 !important;
+ margin: 0 !important;
+ min-width: 0 !important;
+ max-width: 94% !important;
+ flex: none !important;
+ display: block !important;
+ }
+
+  /* Nested fieldset headers render as plain sub-labels, not card headers */
+ #content-main form fieldset fieldset h2,
+ #content form fieldset fieldset h2,
+ #content-main form fieldset .flex-container legend,
+ #content form fieldset .flex-container legend {
+ background: transparent !important;
+ padding: 0 0 4px 0 !important;
+ font-size: 13px !important;
+ color: #374151 !important;
+ text-transform: none !important;
+ letter-spacing: normal !important;
+ }
+
+ /* Ensure form elements inside cards don't overflow */
+ #content-main form fieldset input,
+ #content-main form fieldset select,
+ #content-main form fieldset textarea,
+ #content form fieldset input,
+ #content form fieldset select,
+ #content form fieldset textarea {
+ max-width: 100%;
+ box-sizing: border-box;
+ }
+
+ /* Related widget wrapper should fit within card */
+ #content-main form fieldset .related-widget-wrapper,
+ #content form fieldset .related-widget-wrapper {
+ max-width: 100%;
+ }
+
+ #content-main form fieldset .related-widget-wrapper select,
+ #content form fieldset .related-widget-wrapper select {
+ min-width: 0;
+ flex: 1;
+ }
+
+  /* Labels inside cards and across admin forms */
+ #content-main form fieldset .form-row > label,
+ #content form fieldset .form-row > label,
+ #content-main form fieldset .form-row > .flex-container > label,
+ #content form fieldset .form-row > .flex-container > label,
+ #content-main form label,
+ #content form label,
+ .aligned label,
+ legend {
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ font-weight: 500;
+ color: #374151;
+ display: block;
+ margin-bottom: 8px;
+ float: none !important;
+ width: auto !important;
+ padding: 0 !important;
+ font-size: 13px;
+ letter-spacing: -0.01em;
+ -webkit-font-smoothing: antialiased;
+ -moz-osx-font-smoothing: grayscale;
+ }
+
+ /* Readonly fields styling */
+ #content-main form fieldset .readonly,
+ #content form fieldset .readonly {
+ background: #f8fafc;
+ padding: 12px 14px;
+ border-radius: 8px;
+ font-family: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
+ font-size: 13px;
+ word-break: break-word;
+ line-height: 1.6;
+ border: 1px solid #e2e8f0;
+ color: #475569;
+ }
+
+ /* Long content in readonly */
+ #content-main form fieldset .readonly pre,
+ #content form fieldset .readonly pre {
+ margin: 0;
+ white-space: pre-wrap;
+ word-break: break-word;
+ font-family: inherit;
+ }
+
+ /* Input styling */
+ #content-main form input[type="text"],
+ #content-main form input[type="number"],
+ #content-main form input[type="url"],
+ #content-main form input[type="email"],
+ #content-main form input[type="password"],
+ #content form input[type="text"],
+ #content form input[type="number"],
+ #content form input[type="url"],
+ #content form input[type="email"],
+ #content form input[type="password"] {
+ width: 100%;
+ padding: 10px 14px;
+ border: 1px solid #d1d5db;
+ border-radius: 8px;
+ font-size: 14px;
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ box-sizing: border-box;
+ background: #fff;
+ color: #1e293b;
+ transition: border-color 0.15s ease, box-shadow 0.15s ease;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ #content-main form select,
+ #content form select {
+ width: 100%;
+ border: 1px solid #d1d5db;
+ border-radius: 8px;
+ font-size: 14px;
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ box-sizing: border-box;
+ background: #fff;
+ color: #1e293b;
+ transition: border-color 0.15s ease, box-shadow 0.15s ease;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ #content-main form input::placeholder,
+ #content form input::placeholder {
+ color: #94a3b8;
+ }
+
+ /* Focus states */
+ #content-main form input:focus,
+ #content-main form select:focus,
+ #content-main form textarea:focus,
+ #content form input:focus,
+ #content form select:focus,
+ #content form textarea:focus {
+ border-color: #3b82f6;
+ outline: none;
+ box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+ }
+
+ /* Textarea styling */
+ #content-main form textarea,
+ #content form textarea {
+ width: 100%;
+ box-sizing: border-box;
+ border: 1px solid #d1d5db;
+ border-radius: 8px;
+ padding: 12px 14px;
+ font-size: 14px;
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ line-height: 1.6;
+ resize: vertical;
+ min-height: 80px;
+ color: #1e293b;
+ transition: border-color 0.15s ease, box-shadow 0.15s ease;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ /* Fix vTextField width */
+ .vTextField {
+ width: 100% !important;
+ }
+
+ /* ============================================
+ Button styling (shadcn-inspired)
+ ============================================ */
+
+ /* Base button styles */
+ input[type="submit"],
+ button,
+ .button,
+ .btn,
+ a.button,
+ .submit-row input,
+ .submit-row a.button {
+ display: inline-flex;
+ align-items: center;
+ justify-content: center;
+ gap: 8px;
+ padding: 10px 18px;
+ font-size: 14px;
+ font-weight: 500;
+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
+ line-height: 1.4;
+ border-radius: 8px;
+ border: 1px solid transparent;
+ cursor: pointer;
+ transition: all 0.15s ease;
+ text-decoration: none;
+ white-space: nowrap;
+ -webkit-font-smoothing: antialiased;
+ }
+
+ /* Primary button (default) */
+ input[type="submit"],
+ button[type="submit"],
+ .button.default,
+ .submit-row input[type="submit"] {
+ background: #0f172a;
+ color: #fff;
+ border-color: #0f172a;
+ }
+
+ input[type="submit"]:hover,
+ button[type="submit"]:hover,
+ .button.default:hover,
+ .submit-row input[type="submit"]:hover {
+ background: #1e293b;
+ border-color: #1e293b;
+ }
+
+ input[type="submit"]:active,
+ button[type="submit"]:active {
+ background: #334155;
+ transform: translateY(1px);
+ }
+
+ /* Secondary/outline buttons */
+ button:not([type="submit"]),
+ .button:not(.default),
+ a.button {
+ background: #fff;
+ color: #374151;
+ border-color: #d1d5db;
+ }
+
+ button:not([type="submit"]):hover,
+ .button:not(.default):hover,
+ a.button:hover {
+ background: #f9fafb;
+ border-color: #9ca3af;
+ color: #1f2937;
+ }
+
+ /* Danger button */
+ .deletelink,
+ a.deletelink,
+ button.deletelink,
+ input[name="delete"],
+ .button.delete {
+ background: #fff;
+ color: #dc2626;
+ border-color: #fecaca;
+ }
+
+ .deletelink:hover,
+ a.deletelink:hover,
+ button.deletelink:hover,
+ input[name="delete"]:hover,
+ .button.delete:hover {
+ background: #fef2f2;
+ border-color: #f87171;
+ color: #b91c1c;
+ }
+
+ /* Small buttons */
+ .btn-sm,
+ .object-tools a,
+ .datetimeshortcuts a {
+ padding: 6px 12px;
+ font-size: 13px;
+ border-radius: 6px;
+ }
+
+ /* Object tools (top action buttons) */
+ .object-tools {
+ margin-bottom: 20px;
+ }
+
+ .object-tools li {
+ margin-left: 10px;
+ }
+
+ .object-tools a {
+ background: #fff;
+ color: #374151;
+ border: 1px solid #d1d5db;
+ text-decoration: none;
+ display: inline-flex;
+ align-items: center;
+ }
+
+ .object-tools a:hover {
+ background: #f9fafb;
+ border-color: #9ca3af;
+ }
+
+ /* Submit row styling */
+ .submit-row {
+ margin-top: 24px;
+ padding: 20px;
+ background: #fff;
+ border-radius: 12px;
+ border: 1px solid #e2e8f0;
+ box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+ clear: both;
+ flex: 1 1 100%;
+ display: flex;
+ gap: 12px;
+ flex-wrap: wrap;
+ align-items: center;
+ }
+
+ .submit-row p {
+ margin: 0;
+ }
+
+ .submit-row .deletelink-box {
+ margin-left: auto;
+ }
+
+ /* Responsive: 2 columns on medium screens */
+ @media (max-width: 1400px) {
+ #content-main form fieldset,
+ #content form fieldset {
+ max-width: calc(50% - 10px);
+ flex: 1 1 320px;
+ }
+ }
+
+ /* Responsive: stack on smaller screens */
+ @media (max-width: 900px) {
+ #content-main form fieldset,
+ #content form fieldset {
+ flex: 1 1 100%;
+ max-width: 100%;
+ min-width: auto;
+ }
+
+ #content {
+ padding: 16px;
+ }
+ }
+
+ /* Module content padding */
+ #content-main form .module > div,
+ #content form .module > div {
+ padding: 12px;
+ }
+
+ /* Fix for JSON/config editor */
+ .field-config .readonly,
+ .field-config textarea {
+ width: 100%;
+ min-height: 120px;
+ max-height: none;
+ }
+
+ /* Related widget styling */
+ .related-widget-wrapper {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex-wrap: wrap;
+ }
+
+ .related-widget-wrapper select {
+ flex: 1;
+ min-width: 150px;
+ }
+
+ .related-widget-wrapper a {
+ flex-shrink: 0;
+ padding: 8px;
+ border-radius: 6px;
+ color: #64748b;
+ transition: color 0.15s ease, background 0.15s ease;
+ }
+
+ .related-widget-wrapper a:hover {
+ color: #1e293b;
+ background: #f1f5f9;
+ }
+
+ /* Help text styling */
+ .help {
+ font-size: 13px;
+ color: #64748b;
+ margin-top: 6px;
+ line-height: 1.5;
+ }
+
+ /* Error styling */
+ .errorlist {
+ color: #dc2626;
+ font-size: 13px;
+ margin: 6px 0;
+ padding: 0;
+ list-style: none;
+ }
+
+ .errorlist li {
+ background: #fef2f2;
+ padding: 8px 12px;
+ border-radius: 6px;
+ border: 1px solid #fecaca;
+ }
+
+ /* Inline related objects - force full width */
+ .inline-group,
+ #archiveresult_set-group,
+ #content-main form .inline-group,
+ #content-main form > div > .inline-group,
+ #content form > div > .inline-group,
+ .change-form .inline-group,
+ div.inline-group {
+ flex: 1 1 100% !important;
+ max-width: 100% !important;
+ min-width: 100% !important;
+ width: 100% !important;
+ margin-top: 20px;
+ flex-basis: 100% !important;
+ }
+
+ /* Ensure inline-group breaks out of card grid */
+ #content-main form > div,
+ #content form > div {
+ flex-wrap: wrap;
+ }
+
+ /* TabularInline table full width */
+ .inline-group .tabular,
+ .inline-group table {
+ width: 100% !important;
+ }
+
+ .inline-related {
+ margin: 12px 0;
+ padding: 16px;
+ background: #fff;
+ border-radius: 10px;
+ border: 1px solid #e2e8f0;
+ }
+
+ .inline-related h3 {
+ margin: -16px -16px 16px -16px;
+ padding: 12px 16px;
+ background: #f8fafc;
+ border-radius: 9px 9px 0 0;
+ border-bottom: 1px solid #e2e8f0;
+ font-size: 13px;
+ font-weight: 600;
+ color: #374151;
+ }
+
+ /* Tabular inline styling */
+ .tabular {
+ border-radius: 8px;
+ overflow: hidden;
+ border: 1px solid #e2e8f0;
+ }
+
+ .tabular td, .tabular th {
+ padding: 12px 14px;
+ font-size: 13px;
+ border-bottom: 1px solid #f1f5f9;
+ }
+
+ .tabular th {
+ background: #f8fafc;
+ font-weight: 600;
+ color: #374151;
+ text-align: left;
+ }
+
+ .tabular tr:last-child td {
+ border-bottom: none;
+ }
+
+  /* Inline "Remove" / delete link */
+ .inline-deletelink {
+ color: #dc2626;
+ font-size: 13px;
+ }
+
+ /* Datetime widgets */
+ .datetimeshortcuts {
+ margin-left: 10px;
+ }
+
+ .datetimeshortcuts a {
+ background: #f1f5f9;
+ color: #475569;
+ border: none;
+ padding: 4px 10px;
+ }
+
+ .datetimeshortcuts a:hover {
+ background: #e2e8f0;
+ color: #1e293b;
+ }
+
+ /* Aligned forms - fix label positioning */
+ .aligned .form-row > div {
+ margin-left: 0 !important;
+ }
+
+ /* Checkbox styling */
+ input[type="checkbox"] {
+ width: 18px;
+ height: 18px;
+ border-radius: 4px;
+ border: 1px solid #d1d5db;
+ cursor: pointer;
+ accent-color: #3b82f6;
+ }
+
+ /* Links styling */
+ a {
+ color: #2563eb;
+ text-decoration: none;
+ transition: color 0.15s ease;
+ }
+
+ a:hover {
+ color: #1d4ed8;
+ }
+
+ /* Messages/alerts */
+ .messagelist {
+ padding: 0;
+ margin: 0 0 20px 0;
+ }
+
+ .messagelist li {
+ padding: 14px 18px;
+ border-radius: 10px;
+ font-size: 14px;
+ margin-bottom: 10px;
+ display: flex;
+ align-items: center;
+ gap: 10px;
+ }
+
+ ul.messagelist li.success {
+ background: #f0fdf4 !important;
+ background-image: none !important;
+ border: 1px solid #bbf7d0;
+ color: #166534;
+ }
+
+ .messagelist li.warning {
+ background: #fffbeb;
+ border: 1px solid #fde68a;
+ color: #92400e;
+ }
+
+ .messagelist li.error {
+ background: #fef2f2;
+ border: 1px solid #fecaca;
+ color: #991b1b;
+ }
+
+ /* Breadcrumbs */
+ .breadcrumbs {
+ background: transparent;
+ padding: 12px 24px;
+ font-size: 13px;
+ color: #64748b;
+ }
+
+ .breadcrumbs a {
+ color: #64748b;
+ }
+
+ .breadcrumbs a:hover {
+ color: #1e293b;
+ }
+
+ /* Action buttons in cards */
+ .card .btn,
+ .card button {
+ margin-top: 10px;
+ }
+
+ /* Select2 overrides */
+ .select2-container--default .select2-selection--single,
+ .select2-container--default .select2-selection--multiple {
+ border: 1px solid #d1d5db;
+ border-radius: 8px;
+ min-height: 42px;
+ }
+
+ .select2-container--default .select2-selection--single:focus,
+ .select2-container--default .select2-selection--multiple:focus {
+ border-color: #3b82f6;
+ box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+ }
+
+ /* ============================================
+ Admin List/Changelist Page Styling
+ ============================================ */
+
+ /* Results table container */
+ #changelist {
+ background: #fff;
+ border-radius: 12px;
+ border: 1px solid #e2e8f0;
+ box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+ overflow: hidden;
+ }
+
+ /* Table styling */
+ #result_list {
+ width: 100%;
+ border-collapse: collapse;
+ font-size: 14px;
+ }
+
+ #result_list thead th {
+ background: #f8fafc;
+ border-bottom: 2px solid #e2e8f0;
+ padding: 12px 16px;
+ font-weight: 600;
+ font-size: 13px;
+ color: #475569;
+ text-align: left;
+ text-transform: uppercase;
+ letter-spacing: 0.025em;
+ white-space: nowrap;
+ }
+
+ #result_list thead th a {
+ color: #475569;
+ text-decoration: none;
+ }
+
+ #result_list thead th a:hover {
+ color: #1e293b;
+ }
+
+ #result_list thead th.sorted {
+ background: #f1f5f9;
+ }
+
+ #result_list thead th .text span {
+ padding-right: 5px;
+ }
+
+ #result_list tbody tr {
+ border-bottom: 1px solid #f1f5f9;
+ transition: background-color 0.15s ease;
+ }
+
+ #result_list tbody tr:hover {
+ background-color: #f8fafc;
+ }
+
+ #result_list tbody tr.selected {
+ background-color: #eff6ff;
+ }
+
+ #result_list tbody td {
+ padding: 12px 16px;
+ color: #334155;
+ vertical-align: middle;
+ }
+
+ #result_list tbody td a {
+ color: #2563eb;
+ font-weight: 500;
+ }
+
+ #result_list tbody td a:hover {
+ color: #1d4ed8;
+ text-decoration: underline;
+ }
+
+ /* Checkbox column */
+ #result_list .action-checkbox,
+ #result_list th.action-checkbox-column {
+ width: 40px;
+ text-align: center;
+ padding: 12px 8px;
+ }
+
+ /* Pagination */
+ .paginator {
+ background: #f8fafc;
+ padding: 12px 16px;
+ border-top: 1px solid #e2e8f0;
+ font-size: 14px;
+ color: #64748b;
+ }
+
+ .paginator a {
+ color: #2563eb;
+ padding: 6px 12px;
+ border-radius: 6px;
+ margin: 0 2px;
+ text-decoration: none;
+ }
+
+ .paginator a:hover {
+ background: #e2e8f0;
+ }
+
+ /* Toolbar / search bar */
+ #toolbar {
+ padding: 16px;
+ background: #fff;
+ border-bottom: 1px solid #e2e8f0;
+ display: flex;
+ align-items: center;
+ gap: 12px;
+ }
+
+ #toolbar form {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ flex: 1;
+ }
+
+ #searchbar {
+ flex: 1;
+ max-width: 400px;
+ padding: 10px 14px;
+ border: 1px solid #d1d5db;
+ border-radius: 8px;
+ font-size: 14px;
+ }
+
+ #searchbar:focus {
+ border-color: #3b82f6;
+ outline: none;
+ box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
+ }
+
+ /* Filter sidebar */
+ #changelist-filter {
+ background: #fff;
+ border: 1px solid #e2e8f0;
+ border-radius: 12px;
+ box-shadow: 0 1px 3px rgba(0,0,0,0.04);
+ overflow: hidden;
+ }
+
+ #changelist-filter h2 {
+ background: #f8fafc;
+ padding: 12px 16px;
+ font-size: 13px;
+ font-weight: 600;
+ color: #475569;
+ text-transform: uppercase;
+ letter-spacing: 0.025em;
+ margin: 0;
+ border-bottom: 1px solid #e2e8f0;
+ }
+
+ #changelist-filter h3 {
+ padding: 12px 16px 8px;
+ font-size: 12px;
+ font-weight: 600;
+ color: #64748b;
+ text-transform: uppercase;
+ letter-spacing: 0.05em;
+ margin: 0;
+ }
+
+ #changelist-filter ul {
+ padding: 0 8px 12px;
+ margin: 0;
+ list-style: none;
+ }
+
+ #changelist-filter li {
+ margin: 0;
+ }
+
+ #changelist-filter li a {
+ display: block;
+ padding: 8px 12px;
+ color: #475569;
+ text-decoration: none;
+ border-radius: 6px;
+ font-size: 14px;
+ transition: background-color 0.15s ease;
+ }
+
+ #changelist-filter li a:hover {
+ background: #f1f5f9;
+ color: #1e293b;
+ }
+
+ #changelist-filter li.selected a {
+ background: #eff6ff;
+ color: #2563eb;
+ font-weight: 500;
+ }
+
+ /* Actions bar */
+ .actions {
+ padding: 12px 16px;
+ background: #f8fafc;
+ border-bottom: 1px solid #e2e8f0;
+ display: flex;
+ align-items: center;
+ gap: 12px;
+ flex-wrap: wrap;
+ }
+
+ .actions label {
+ font-size: 14px;
+ color: #475569;
+ }
+
+ .actions select {
+ padding: 8px 12px;
+ border: 1px solid #d1d5db;
+ border-radius: 6px;
+ font-size: 14px;
+ background: #fff;
+ }
+
+ .actions .button {
+ padding: 8px 16px;
+ font-size: 14px;
+ }
+
+ /* Object count */
+ .actions .action-counter {
+ color: #64748b;
+ font-size: 14px;
+ }
+
+ /* Empty results */
+ #changelist-form .results + p,
+ .paginator + p {
+ padding: 40px;
+ text-align: center;
+ color: #64748b;
+ font-size: 15px;
+ }
+
+ /* Date hierarchy */
+ .xfull {
+ padding: 12px 16px;
+ background: #f8fafc;
+ border-bottom: 1px solid #e2e8f0;
+ }
+
+ .xfull a {
+ color: #2563eb;
+ margin-right: 8px;
+ }
{% endblock %}
diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html
index bdf9d64f..1b9d9dde 100644
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -57,13 +57,24 @@
box-shadow: 0 0 8px #3fb950;
animation: pulse 2s infinite;
}
+ #progress-monitor .status-dot.idle {
+ background: #d29922;
+ box-shadow: 0 0 4px #d29922;
+ }
#progress-monitor .status-dot.stopped {
- background: #f85149;
+ background: #6e7681;
+ }
+ #progress-monitor .status-dot.flash {
+ animation: flash 0.3s ease-out;
}
@keyframes pulse {
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
}
+ @keyframes flash {
+ 0% { transform: scale(1.5); }
+ 100% { transform: scale(1); }
+ }
/* Stats */
#progress-monitor .stats {
@@ -89,6 +100,19 @@
#progress-monitor .stat-value.error { color: #f85149; }
#progress-monitor .stat-value.warning { color: #d29922; }
#progress-monitor .stat-value.info { color: #58a6ff; }
+ #progress-monitor .stat.clickable {
+ cursor: pointer;
+ padding: 2px 6px;
+ margin: -2px -6px;
+ border-radius: 4px;
+ transition: background 0.2s;
+ }
+ #progress-monitor .stat.clickable:hover {
+ background: rgba(255,255,255,0.1);
+ }
+ #progress-monitor .stat.clickable:active {
+ background: rgba(255,255,255,0.2);
+ }
/* Toggle Button */
#progress-monitor .toggle-btn {
@@ -259,48 +283,86 @@
padding: 0 12px 8px;
}
- /* Extractor List */
+ /* Extractor List - Compact Badge Layout */
#progress-monitor .extractor-list {
padding: 8px 12px;
background: rgba(0,0,0,0.2);
border-top: 1px solid #21262d;
+ display: flex;
+ flex-wrap: wrap;
+ gap: 4px;
}
- #progress-monitor .extractor-item {
+ #progress-monitor .extractor-badge {
+ position: relative;
+ display: inline-flex;
+ align-items: center;
+ gap: 4px;
+ padding: 3px 8px;
+ border-radius: 4px;
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+ font-size: 10px;
+ background: #21262d;
+ overflow: hidden;
+ white-space: nowrap;
+ }
+ #progress-monitor .extractor-badge .progress-fill {
+ position: absolute;
+ top: 0;
+ left: 0;
+ bottom: 0;
+ z-index: 0;
+ transition: width 0.3s ease-out;
+ }
+ #progress-monitor .extractor-badge .badge-content {
+ position: relative;
+ z-index: 1;
display: flex;
align-items: center;
- gap: 8px;
- padding: 4px 0;
+ gap: 4px;
}
- #progress-monitor .extractor-icon {
- font-size: 12px;
- width: 16px;
- text-align: center;
+ #progress-monitor .extractor-badge.queued {
+ color: #8b949e;
}
- #progress-monitor .extractor-icon.running {
+ #progress-monitor .extractor-badge.queued .progress-fill {
+ background: rgba(110, 118, 129, 0.2);
+ width: 0%;
+ }
+ #progress-monitor .extractor-badge.started {
color: #d29922;
- animation: spin 1s linear infinite;
}
- #progress-monitor .extractor-icon.success {
+ #progress-monitor .extractor-badge.started .progress-fill {
+ background: rgba(210, 153, 34, 0.3);
+ width: 50%;
+ animation: progress-pulse 1.5s ease-in-out infinite;
+ }
+ @keyframes progress-pulse {
+ 0%, 100% { opacity: 0.5; }
+ 50% { opacity: 1; }
+ }
+ #progress-monitor .extractor-badge.succeeded {
color: #3fb950;
}
- #progress-monitor .extractor-icon.failed {
+ #progress-monitor .extractor-badge.succeeded .progress-fill {
+ background: rgba(63, 185, 80, 0.25);
+ width: 100%;
+ }
+ #progress-monitor .extractor-badge.failed {
color: #f85149;
}
- #progress-monitor .extractor-icon.pending {
- color: #8b949e;
+ #progress-monitor .extractor-badge.failed .progress-fill {
+ background: rgba(248, 81, 73, 0.25);
+ width: 100%;
+ }
+ #progress-monitor .extractor-badge .badge-icon {
+ font-size: 10px;
+ }
+ #progress-monitor .extractor-badge.started .badge-icon {
+ animation: spin 1s linear infinite;
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
- #progress-monitor .extractor-name {
- flex: 1;
- font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
- font-size: 11px;
- }
- #progress-monitor .extractor-progress {
- width: 60px;
- }
/* Status Badge */
#progress-monitor .status-badge {
@@ -356,11 +418,11 @@
Queued
0
-
+
Done
0
-
+
Failed
0
@@ -390,6 +452,24 @@
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
+ // Baselines for resettable counters
+ let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
+ let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
+ let lastSucceeded = 0;
+ let lastFailed = 0;
+
+ // Click handlers for resetting counters
+ document.getElementById('stat-succeeded').addEventListener('click', function() {
+ succeededBaseline = lastSucceeded;
+ localStorage.setItem('progress-succeeded-baseline', succeededBaseline);
+ document.getElementById('total-succeeded').textContent = '0';
+ });
+ document.getElementById('stat-failed').addEventListener('click', function() {
+ failedBaseline = lastFailed;
+ localStorage.setItem('progress-failed-baseline', failedBaseline);
+ document.getElementById('total-failed').textContent = '0';
+ });
+
function formatUrl(url) {
try {
const u = new URL(url);
@@ -400,24 +480,18 @@
}
function renderExtractor(extractor) {
- const iconClass = extractor.status === 'started' ? 'running' :
- extractor.status === 'succeeded' ? 'success' :
- extractor.status === 'failed' ? 'failed' : 'pending';
const icon = extractor.status === 'started' ? '↻' :
extractor.status === 'succeeded' ? '✓' :
extractor.status === 'failed' ? '✗' : '○';
return `
-
+
`;
}
@@ -427,10 +501,14 @@
const statusIcon = snapshot.status === 'started' ? '↻' : '📄';
let extractorHtml = '';
- if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
+ if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
+ // Sort extractors alphabetically by name to prevent reordering on updates
+ const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
+ a.extractor.localeCompare(b.extractor)
+ );
extractorHtml = `
`;
}
@@ -438,7 +516,7 @@
return `