Improve logging and admin UI

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -0,0 +1,3 @@
[SERVER_CONFIG]
# SECURITY: secret keys must never be committed to version control — this key is now
# public and should be rotated; load SECRET_KEY from the environment instead.
SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4

View File

@@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
fields = ('created_by', 'token', 'expires', *readonly_fields)
fieldsets = (
('Token', {
'fields': ('token', 'expires'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
fieldsets = (
('Webhook', {
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
'classes': ('card', 'wide'),
}),
('Authentication', {
'fields': ('auth_token',),
'classes': ('card',),
}),
('Status', {
'fields': ('enabled', 'last_success', 'last_error'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
def register_admin(admin_site):
admin_site.register(APIToken, APITokenAdmin)

View File

@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run orchestrator until all work is done
# Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete

View File

@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL
try:
snap = Snapshot.objects.get(url=record['url'])
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
except Snapshot.DoesNotExist:
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:

View File

@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl = Crawl.objects.create(
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
defaults={
'status': 'queued',
}
)
# If crawl already existed, reset it to queued state so it can be processed again
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=CrawlModel.FINAL_STATES
)
print(f'[+] Crawls in queue: {queued_crawls.count()}')
if queued_crawls.exists():
for c in queued_crawls:
print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()

View File

@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
from workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
# If daphne is already running, just tail logs
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()
# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View File

@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath

View File

@@ -5,7 +5,6 @@ import sys
from datetime import datetime, timezone
from rich.progress import Progress
from rich.console import Console
import django
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True)
logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0


def bump_startup_progress_bar(advance=1):
    """Advance the transient startup progress bar by *advance* ticks.

    No-op when no progress bar is currently active (i.e. outside the
    ``setup_django()`` startup window where INITIAL_STARTUP_PROGRESS is set).
    """
    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK
    progress = INITIAL_STARTUP_PROGRESS
    if not progress:
        # Nothing to update — startup progress display is not running.
        return
    progress.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)  # type: ignore
def setup_django_minimal():
# sys.path.append(str(CONSTANTS.PACKAGE_DIR))
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
@@ -49,9 +38,7 @@ DJANGO_SET_UP = False
def setup_django(check_db=False, in_memory_db=False) -> None:
from rich.panel import Panel
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
global DJANGO_SET_UP
if DJANGO_SET_UP:
@@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
bump_startup_progress_bar()
try:
from django.core.management import call_command
bump_startup_progress_bar()
# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
if in_memory_db:
raise Exception('dont use this anymore')
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
try:
from django.core.management import call_command
if in_memory_db:
raise Exception('dont use this anymore')
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()
bump_startup_progress_bar()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()
except Exception as e:
bump_startup_progress_bar(advance=1000)
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
STDERR.print_exception(show_locals=False)
return
bump_startup_progress_bar()
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
except Exception as e:
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
raise SystemExit(9)
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
STDERR.print_exception(show_locals=False)
return
bump_startup_progress_bar()
from django.conf import settings
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print()
raise SystemExit(9)
bump_startup_progress_bar()
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
# import logfire
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()
# import logfire
# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
except KeyboardInterrupt:
raise SystemExit(2)
except KeyboardInterrupt:
raise SystemExit(2)
DJANGO_SET_UP = True
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = None

View File

@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render a nice inline list view of archive results with status, extractor, output, and actions.

    Args:
        archiveresults_qs: ArchiveResult queryset to render (ordered here by -end_ts).
        limit: maximum number of rows rendered inline before showing a "View all" footer link.

    Returns:
        A mark_safe() HTML fragment suitable for embedding in a readonly admin field.
    """
    # SECURITY FIX: DB-sourced text (output, cmd, pwd, extractor, cmd_version) is
    # interpolated into raw HTML that gets mark_safe()'d, so it must be escaped
    # first to prevent stored XSS in the admin UI (e.g. a crawled page whose
    # extractor output contains '<script>').
    from html import escape

    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    # Status colors: (foreground, background) per status value
    status_colors = {
        'succeeded': ('#166534', '#dcfce7'),  # green
        'failed': ('#991b1b', '#fee2e2'),     # red
        'queued': ('#6b7280', '#f3f4f6'),     # gray
        'started': ('#92400e', '#fef3c7'),    # amber
    }
    rows = []
    for idx, result in enumerate(results):
        status = escape(result.status or 'queued')
        color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
        # Get extractor icon (comes from our own plugin registry, not user data)
        icon = get_extractor_icon(result.extractor)
        extractor = escape(str(result.extractor))
        # Format timestamp (server-generated, safe)
        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
        # Truncate output for display, then escape both the truncated and full forms
        raw_output = result.output or '-'
        output_display = raw_output[:60]
        if len(raw_output) > 60:
            output_display += '...'
        full_output = escape(raw_output)
        output_display = escape(output_display)
        # Get full command as tooltip (escaped: argv may contain arbitrary URLs)
        cmd_str = escape(' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-'))
        # Build output link (escaped: result.output is attacker-influenced path text)
        output_link = escape(f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/')
        # Get version - try cmd_version field
        version = escape(str(result.cmd_version)) if result.cmd_version else '-'
        pwd = escape(str(result.pwd)) if result.pwd else '-'
        # Unique ID for this row's expandable output
        row_id = f'output_{idx}_{str(result.id)[:8]}'
        rows.append(f'''
            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
                <td style="padding: 10px 12px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
                                 color: {color}; background: {bg};">{status}</span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{extractor}">
                    {icon}
                </td>
                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
                    {extractor}
                </td>
                <td style="padding: 10px 12px; max-width: 280px;">
                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
                          style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
                          title="Click to expand full output">
                        {output_display}
                    </span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
                    {end_time}
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
                    {version}
                </td>
                <td style="padding: 10px 8px; white-space: nowrap;">
                    <div style="display: flex; gap: 4px;">
                        <a href="{output_link}" target="_blank"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="View output">📄</a>
                        <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="Edit">✏️</a>
                    </div>
                </td>
            </tr>
            <tr style="border-bottom: 1px solid #e2e8f0;">
                <td colspan="7" style="padding: 0 12px 10px 12px;">
                    <details id="{row_id}" style="margin: 0;">
                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                            Details &amp; Output
                        </summary>
                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
                                <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{pwd}</code></span>
                            </div>
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <b>Output:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre>
                            <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                                <b>Command:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
                        </div>
                    </details>
                </td>
            </tr>
        ''')

    total_count = archiveresults_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                    Showing {limit} of {total_count} results &nbsp;
                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
                       style="color: #2563eb;">View all →</a>
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
fieldsets = (
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
}),
('Status', {
'fields': ('status', 'retry_at', 'status_info'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'classes': ('card',),
}),
('Config', {
'fields': ('config',),
'classes': ('card',),
}),
('Files', {
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
}),
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
return render_archiveresults_list(obj.archiveresult_set.all())
@admin.display(
description='Title',
ordering='title',

View File

@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
fieldsets = (
('Tag Info', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the orchestrator if we're running a web server."""
    import os

    # Skip for non-server management commands (migrations, shell, tests, etc.).
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return

    # Honor an explicit opt-out via the RUN_ORCHESTRATOR env var.
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return

    # Under runserver's autoreloader, only the reloaded child (RUN_MAIN=true)
    # should spawn the orchestrator, otherwise it would be started twice.
    if 'runserver' in sys.argv and os.environ.get('RUN_MAIN') != 'true':
        return

    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            Orchestrator(exit_on_idle=False).start()
    except Exception as e:
        # A broken orchestrator must never take down the web server itself.
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -0,0 +1,22 @@
from django.db import migrations, models
class Migration(migrations.Migration):
"""Relax Snapshot.url uniqueness: the same URL may now appear in multiple crawls,
but only once per (url, crawl) pair."""
dependencies = [
('core', '0024_snapshot_crawl'),
]
operations = [
# Remove the unique constraint on url
# (db_index is kept so lookups by url stay fast)
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True, unique=False),
),
# Add unique constraint on (url, crawl) combination
# NOTE(review): rows with crawl IS NULL are effectively exempt from a
# multi-column UniqueConstraint (NULLs compare unequal in SQLite/Postgres)
# — confirm duplicate crawl-less snapshots are acceptable.
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
),
]

View File

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
def save(self, *args, **kwargs):
    """Persist the ArchiveResult, logging a DB event the first time it is created."""
    # _state.adding is only True before the initial INSERT, so capture it first.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created ArchiveResult',
        indent_level=3,
        extractor=self.extractor,
        metadata={
            'id': str(self.id),
            'snapshot_id': str(self.snapshot_id),
            'snapshot_url': str(self.snapshot.url)[:64],
            'status': self.status,
        },
    )
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
# Run the hook
start_ts = timezone.now()
result = run_hook(

View File

@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.run()')
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
# Suppressed: extractor result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
@succeeded.enter
def enter_succeeded(self):
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter
def enter_failed(self):
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,

View File

@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
# Get all extractors for this snapshot
# Order: started first, then queued, then completed
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
]
active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
'all_extractors': all_extractors,
})
# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})
return JsonResponse({

View File

@@ -8,6 +8,7 @@ from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.db.models import Count, Q
from archivebox import DATA_DIR
@@ -19,13 +20,155 @@ from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):
    """Render a nice inline list view of snapshots with status, title, URL, and progress.

    Args:
        snapshots_qs: a Snapshot queryset (unsliced) to render.
        limit: maximum number of rows to show (a footer reports the total if exceeded).

    Returns:
        A mark_safe() HTML table string suitable for a readonly admin field.
    """
    from django.utils.html import escape

    # NOTE: annotate() must be applied *before* slicing -- Django raises an
    # error if a queryset is annotated after a slice has been taken.
    snapshots = snapshots_qs.annotate(
        total_results=Count('archiveresult'),
        succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
        failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
    ).order_by('-created_at')[:limit]
    if not snapshots:
        return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')

    # Status colors matching Django admin and progress monitor
    status_colors = {
        'queued': ('#6c757d', '#f8f9fa'),  # gray
        'started': ('#856404', '#fff3cd'),  # amber
        'sealed': ('#155724', '#d4edda'),  # green
        'failed': ('#721c24', '#f8d7da'),  # red
    }

    rows = []
    for snapshot in snapshots:
        status = snapshot.status or 'queued'
        color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))

        # Calculate progress
        total = snapshot.total_results
        done = snapshot.succeeded_results + snapshot.failed_results
        progress_pct = int((done / total) * 100) if total > 0 else 0
        progress_text = f'{done}/{total}' if total > 0 else '-'

        # Truncate title and URL, then escape: title and url come from archived
        # pages (untrusted input) and this HTML is wrapped in mark_safe() below,
        # so unescaped interpolation would be an XSS vector.
        title = (snapshot.title or 'Untitled')[:60]
        if len(snapshot.title or '') > 60:
            title += '...'
        title = escape(title)
        title_attr = escape(snapshot.title or 'Untitled')
        url_display = snapshot.url[:50]
        if len(snapshot.url) > 50:
            url_display += '...'
        url_display = escape(url_display)
        url_attr = escape(snapshot.url)

        # Format date
        date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'

        rows.append(f'''
            <tr style="border-bottom: 1px solid #eee;">
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
                                 font-size: 11px; font-weight: 500; text-transform: uppercase;
                                 color: {color}; background: {bg};">{status}</span>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
                        <img src="/archive/{snapshot.timestamp}/favicon.ico"
                             style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
                             onerror="this.style.display='none'"/>
                    </a>
                </td>
                <td style="padding: 6px 8px; max-width: 300px;">
                    <a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
                       title="{title_attr}">{title}</a>
                </td>
                <td style="padding: 6px 8px; max-width: 250px;">
                    <a href="{url_attr}" target="_blank"
                       style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
                       title="{url_attr}">{url_display}</a>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
                    <div style="display: inline-flex; align-items: center; gap: 6px;">
                        <div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
                            <div style="width: {progress_pct}%; height: 100%;
                                        background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
                                        transition: width 0.3s;"></div>
                        </div>
                        <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
                           style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
                           title="View archive results">{progress_text}</a>
                    </div>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
                    {date_str}
                </td>
            </tr>
        ''')

    total_count = snapshots_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
                    Showing {limit} of {total_count} snapshots
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
                <thead>
                    <tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
                        <th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
fieldsets = (
('Source', {
'fields': ('uri', 'contents'),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_str'),
'classes': ('card',),
}),
('Settings', {
'fields': ('extractor', 'config'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Crawls', {
'fields': ('scheduled_crawls', 'crawls'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card',),
}),
)
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
)) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(obj.snapshot_set.all())
def contents(self, obj):
if obj.uri.startswith('file:///data/'):
source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
source_file = obj.get_file_path()
if source_file:
contents = ""
try:
contents = source_file.read_text().strip()[:14_000]
except Exception as e:
contents = f'Error reading {source_file}: {e}'
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
fieldsets = (
('URLs', {
'fields': ('seed_urls_editor',),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes'),
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'config'),
'classes': ('card',),
}),
('Status', {
'fields': ('status', 'retry_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('seed', 'schedule', 'created_by'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same seed and settings."""
from django.utils import timezone
from django.shortcuts import redirect
# Validate seed has a URI (required for crawl to start)
if not obj.seed:
messages.error(request, 'Cannot recrawl: original crawl has no seed.')
return redirect('admin:crawls_crawl_change', obj.id)
if not obj.seed.uri:
messages.error(request, 'Cannot recrawl: seed has no URI.')
return redirect('admin:crawls_crawl_change', obj.id)
new_crawl = Crawl.objects.create(
seed=obj.seed,
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
f'It will start processing shortly.'
)
# Redirect to the new crawl's change page
from django.shortcuts import redirect
return redirect('admin:crawls_crawl_change', new_crawl.id)
def get_urls(self):
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except Crawl.DoesNotExist:
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
source_file = crawl.seed.get_file_path() if crawl.seed else None
if not source_file:
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
try:
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except json.JSONDecodeError:
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
try:
# Ensure parent directory exists
source_file.parent.mkdir(parents=True, exist_ok=True)
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return obj.snapshot_set.count()
def snapshots(self, obj):
return format_html_join('<br/>', '<a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(obj.snapshot_set.all())
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
seed_uri = obj.urls
# Check if it's a local file we can edit
is_file = seed_uri.startswith('file:///data/')
source_file = obj.seed.get_file_path() if obj.seed else None
is_file = source_file is not None
contents = ""
error = None
source_file = None
if is_file:
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
if is_file and source_file:
try:
contents = source_file.read_text().strip()
except Exception as e:
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
fieldsets = (
('Schedule Info', {
'fields': ('label', 'notes'),
'classes': ('card',),
}),
('Configuration', {
'fields': ('schedule', 'template'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Crawls', {
'fields': ('crawls',),
'classes': ('card', 'wide'),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
def snapshots(self, obj):
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
def register_admin(admin_site):

View File

@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def __str__(self):
return f'[{self.id}] {self.uri[:64]}'
def save(self, *args, **kwargs):
    """Persist the Seed, logging a DB event the first time it is created."""
    # _state.adding flips to False after the initial INSERT, so read it first.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created Seed',
        indent_level=0,
        metadata={
            'id': str(self.id),
            'uri': str(self.uri)[:64],
            'extractor': self.extractor,
            'label': self.label or None,
        },
    )
@classmethod
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
# Use absolute path for file:// URLs so extractors can find the files
source_path = str(source_file.resolve())
seed, _ = cls.objects.get_or_create(
label=label or source_file.name, uri=f'file://{source_path}',
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def api_url(self) -> str:
return reverse_lazy('api-1:get_seed', args=[self.id])
def get_file_path(self) -> Path | None:
    """
    Get the filesystem path for file:// URIs.
    Handles both old format (file:///data/...) and new format (file:///absolute/path).
    Returns None if URI is not a file:// URI.
    """
    uri = self.uri
    if not uri.startswith('file://'):
        return None
    raw_path = uri[len('file://'):]
    # Old format: a /data/-rooted path is resolved relative to DATA_DIR.
    if raw_path.startswith('/data/'):
        return CONSTANTS.DATA_DIR / raw_path[len('/data/'):]
    # New format: already an absolute filesystem path.
    return Path(raw_path)
@property
def snapshot_set(self) -> QuerySet['Snapshot']:
from core.models import Snapshot
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def __str__(self):
    """Short human-readable label: '[<id>] <truncated seed uri>'."""
    seed_uri = self.seed.uri[:64] if self.seed else ''
    return f'[{self.id}] {seed_uri}'
def save(self, *args, **kwargs):
    """Persist the Crawl, logging a DB event the first time it is created."""
    # Capture before super().save(): _state.adding is cleared by the INSERT.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created Crawl',
        indent_level=1,
        metadata={
            'id': str(self.id),
            'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
            'max_depth': self.max_depth,
            'status': self.status,
        },
    )
@classmethod
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
crawl, _ = cls.objects.get_or_create(

View File

@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
super().__init__(crawl, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Crawl\\[{self.crawl.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Crawl[{self.crawl.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
return bool(self.crawl.seed and self.crawl.seed.uri)
if not self.crawl.seed:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
return False
if not self.crawl.seed.uri:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
return False
return True
def is_finished(self) -> bool:
from core.models import Snapshot, ArchiveResult
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
# Suppressed: state transition logs
# lock the crawl object while we create snapshots
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.QUEUED,
)
# Run the crawl - creates root snapshot and processes queued URLs
self.crawl.run()
try:
# Run on_Crawl hooks to validate/install dependencies
self._run_crawl_hooks()
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.STARTED,
# Run the crawl - creates root snapshot and processes queued URLs
self.crawl.run()
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
import traceback
traceback.print_exc()
# Re-raise so the worker knows it failed
raise
def _run_crawl_hooks(self):
"""Run on_Crawl hooks to validate/install dependencies."""
from pathlib import Path
from archivebox.hooks import run_hooks, discover_hooks
from archivebox.config import CONSTANTS
# Discover and run all on_Crawl hooks
hooks = discover_hooks('Crawl')
if not hooks:
return
# Create a temporary output directory for hook results
output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
output_dir.mkdir(parents=True, exist_ok=True)
# Run all on_Crawl hooks
results = run_hooks(
event_name='Crawl',
output_dir=output_dir,
timeout=60,
config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
crawl_id=str(self.crawl.id),
seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
)
@sealed.enter
# Process hook results - parse JSONL output and create DB objects
self._process_hook_results(results)
def _process_hook_results(self, results: list):
    """Process JSONL output from hooks to create InstalledBinary and update Machine config."""
    import json
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()
    for result in results:
        if result['returncode'] != 0:
            # Hook exited non-zero (possibly a missing dependency); nothing to parse.
            continue
        for raw_line in result['stdout'].strip().split('\n'):
            if not raw_line.strip():
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Not JSON, skip
                continue
            record_type = record.get('type')
            if record_type == 'InstalledBinary':
                # Create or update InstalledBinary record,
                # skipping records missing any essential field.
                if not (record.get('name') and record.get('abspath') and record.get('version')):
                    continue
                InstalledBinary.objects.update_or_create(
                    machine=machine,
                    name=record['name'],
                    defaults={
                        'abspath': record['abspath'],
                        'version': record['version'],
                        'sha256': record.get('sha256') or '',
                        'binprovider': record.get('binprovider') or 'env',
                    }
                )
            elif record_type == 'Machine':
                # Hooks may push config values under 'config/<KEY>'.
                if record.get('_method', 'update') == 'update':
                    key = record.get('key', '')
                    value = record.get('value')
                    if key.startswith('config/'):
                        machine.config[key[len('config/'):]] = value
                        machine.save(update_fields=['config'])
            elif record_type == 'Dependency':
                # Dependency request - could trigger installation.
                # For now just log it (installation hooks would be separate).
                print(f'[yellow]Dependency requested: {record.get("bin_name")}[/yellow]')
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
# Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,

View File

@@ -245,6 +245,14 @@ def run_hook(
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
# Pass SEARCH_BACKEND_ENGINE from new-style config
try:
from archivebox.config.configset import get_config
search_config = get_config()
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
except Exception:
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,2 @@
> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False

View File

@@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Identity', {
'fields': ('hostname', 'guid', 'ips'),
'classes': ('card',),
}),
('Hardware', {
'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'),
'classes': ('card',),
}),
('Operating System', {
'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'),
'classes': ('card',),
}),
('Statistics', {
'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Configuration', {
'fields': ('config',),
'classes': ('card', 'wide'),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
@@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Machine', {
'fields': ('machine',),
'classes': ('card',),
}),
('Network', {
'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
'classes': ('card',),
}),
('Location', {
'fields': ('hostname', 'isp', 'city', 'region', 'country'),
'classes': ('card',),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('isp', 'country', 'region')
ordering = ['-created_at']
@@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'bin_name', 'bin_providers')
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
fieldsets = (
('Binary', {
'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
'classes': ('card',),
}),
('Commands', {
'fields': ('custom_cmds',),
'classes': ('card',),
}),
('Configuration', {
'fields': ('config',),
'classes': ('card', 'wide'),
}),
('Timestamps', {
'fields': ('id', 'created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('bin_providers', 'created_at')
ordering = ['-created_at']
@@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
readonly_fields = ('created_at', 'modified_at')
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Binary Info', {
'fields': ('name', 'dependency', 'binprovider'),
'classes': ('card',),
}),
('Location', {
'fields': ('machine', 'abspath'),
'classes': ('card',),
}),
('Version', {
'fields': ('version', 'sha256'),
'classes': ('card',),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
ordering = ['-created_at']

View File

@@ -544,16 +544,21 @@ def log_worker_event(
# Build worker identifier
worker_parts = [worker_type]
if pid:
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
if url and worker_type == 'SnapshotWorker':
if url and worker_type in ('SnapshotWorker', 'DB'):
worker_parts.append(f'url={truncate_url(url)}')
if extractor and worker_type == 'ArchiveResultWorker':
if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
worker_parts.append(f'extractor={extractor}')
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
# Format worker label - only add brackets if there are additional identifiers
if len(worker_parts) > 1:
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
else:
worker_label = worker_parts[0]
# Build metadata string
metadata_str = ''
@@ -579,12 +584,14 @@ def log_worker_event(
meta_parts.append(f'{k}: {len(v)}')
else:
meta_parts.append(f'{k}: {v}')
metadata_str = ' {' + ', '.join(meta_parts) + '}'
metadata_str = ' | '.join(meta_parts)
# Determine color based on event
color = 'white'
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
elif event in ('Processing...', 'PROCESSING'):
color = 'blue'
elif event in ('Completed', 'COMPLETED', 'All work complete'):
@@ -606,8 +613,9 @@ def log_worker_event(
text.append(indent) # Indentation
# Append worker label and event with color
text.append(f'{worker_label} {event}{error_str}', style=color)
# Append metadata without color
text.append(metadata_str)
# Append metadata without color (add separator if metadata exists)
if metadata_str:
text.append(f' | {metadata_str}')
CONSOLE.print(text)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';
const OUTPUT_DIR = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract accessibility info
async function extractAccessibility(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -24,7 +24,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'
OUTPUT_DIR = 'archive_org'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_env(name: str, default: str = '') -> str:

View File

@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
async function captureConsoleLogs(url) {
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Clear existing file

View File

@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const OUTPUT_DIR = 'dom';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -114,10 +114,7 @@ async function dumpDom(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'

View File

@@ -26,7 +26,7 @@ import rich_click as click
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:

View File

@@ -22,9 +22,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const OUTPUT_DIR = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
// Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
}
async function extractHeaders(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -28,7 +28,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
OUTPUT_DIR = 'htmltotext'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
if not text or len(text) < 10:
return False, None, 'No meaningful text extracted from HTML'
# Create output directory and write output
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
output_path.write_text(text, encoding='utf-8')

View File

@@ -39,7 +39,7 @@ import rich_click as click
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = 'media'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Build command (later options take precedence)
cmd = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
timeout = get_env_int('TIMEOUT', 60)
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Get text version

View File

@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = 'parse_dom_outlinks';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {
// Extract outlinks
async function extractOutlinks(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const OUTPUT_DIR = 'pdf';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -113,10 +113,7 @@ async function printToPdf(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -29,7 +29,7 @@ import rich_click as click
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'readability'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
if not html_source:
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Run readability-extractor (outputs JSON by default)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';
const OUTPUT_DIR = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Track redirect chain
async function trackRedirects(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
const OUTPUT_DIR = 'responses';
const CHROME_SESSION_DIR = 'chrome_session';
const OUTPUT_DIR = '.';
const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
// Create output directories
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
// Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
fs.mkdirSync(allDir, { recursive: true });

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';
const OUTPUT_DIR = 'screenshot';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'screenshot.png';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Validation hook for ripgrep binary.
Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from ripgrep binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# ripgrep version string: "ripgrep 14.1.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'ripgrep' and i + 1 < len(parts):
return parts[i + 1]
# Try to find version number pattern
for part in parts:
if part[0].isdigit() and '.' in part:
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ripgrep() -> dict | None:
    """Locate the ripgrep binary and describe it.

    Resolution order:
      1. RIPGREP_BINARY env var, when it contains a '/' and points at an existing file
      2. ``shutil.which(RIPGREP_BINARY)`` when the env var is a bare binary name
      3. ``shutil.which('rg')``

    Returns a dict with name/abspath/version/sha256/binprovider keys,
    or None when no usable binary is found.
    """
    configured = os.environ.get('RIPGREP_BINARY', '')

    if configured and '/' in configured and Path(configured).is_file():
        # Env var is already a usable path
        abspath = configured
    else:
        # Treat the env var (if any) as a binary name to resolve on PATH
        abspath = shutil.which(configured) if configured else None
        if not abspath:
            abspath = shutil.which('rg')

    if not (abspath and Path(abspath).is_file()):
        return None

    return {
        'name': 'rg',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': 'env',
    }
def main():
    """Validate the ripgrep binary and emit JSONL records on stdout.

    No-ops (exit 0, no output) unless SEARCH_BACKEND_ENGINE == 'ripgrep'.
    On success, prints an InstalledBinary record plus Machine config-update
    records and exits 0.  If ripgrep cannot be found, prints a Dependency
    request record and exits 1.
    """
    # Check if ripgrep search backend is enabled
    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
    if search_backend != 'ripgrep':
        # No-op: ripgrep is not the active search backend
        sys.exit(0)

    result = find_ripgrep()
    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/RIPGREP_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/RIPGREP_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Output Dependency request so an install hook can provision ripgrep
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'rg',
            'bin_providers': 'apt,brew,cargo,env',
        }))
        # Exit non-zero to indicate binary not found
        # (plain string literal: the original used an f-string with no placeholders)
        print("ripgrep binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Tests for ripgrep binary detection and archivebox install functionality.
Guards against regressions in:
1. Machine.config overrides not being used in version command
2. Ripgrep hook not resolving binary names via shutil.which()
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
"""
import os
import sys
import json
import shutil
import tempfile
import subprocess
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
def test_ripgrep_hook_detects_binary_from_path():
    """The hook must resolve a bare binary name (RIPGREP_BINARY='rg') to a full path via shutil.which()."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Skip if rg is not installed
    if shutil.which('rg') is None:
        pytest.skip("ripgrep (rg) not installed")

    # Enable the hook and hand it only the binary *name*, not a path (this was the bug)
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"

    # The hook emits JSONL: an InstalledBinary record followed by Machine config updates
    records = [json.loads(ln) for ln in proc.stdout.strip().split('\n') if ln.strip()]
    assert len(records) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"

    binary_record = records[0]
    assert binary_record['type'] == 'InstalledBinary'
    assert binary_record['name'] == 'rg'
    assert '/' in binary_record['abspath'], "Expected full path, not just binary name"
    assert Path(binary_record['abspath']).is_file(), "Binary path should exist"
    assert binary_record['version'], "Version should be detected"

    config_record = records[1]
    assert config_record['type'] == 'Machine'
    assert config_record['key'] == 'config/RIPGREP_BINARY'
    assert '/' in config_record['value'], "Machine config should store full path"
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
    """When another search backend is configured, the hook must be a silent no-op."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Configure a non-ripgrep backend
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'sqlite'}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
    assert proc.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
def test_ripgrep_hook_handles_absolute_path():
    """The hook must also accept RIPGREP_BINARY given as a full absolute path."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    rg_abspath = shutil.which('rg')
    if rg_abspath is None:
        pytest.skip("ripgrep (rg) not installed")

    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': rg_abspath}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"
    assert proc.stdout.strip(), "Hook should produce output"

    # The first JSONL record should echo back the exact path we configured
    first_record = json.loads(proc.stdout.strip().split('\n')[0])
    assert first_record['abspath'] == rg_abspath
@pytest.mark.django_db
def test_machine_config_overrides_base_config():
    """
    Test that Machine.config overrides take precedence over base config.

    Guards against regression where archivebox version was showing binaries
    as "not installed" even though they were detected and stored in Machine.config.
    """
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()

    # Pretend a hook detected chrome at a path that differs from the base config
    chrome_override = '/custom/path/to/chrome'
    machine.config['CHROME_BINARY'] = chrome_override
    machine.config['CHROME_VERSION'] = '143.0.7499.170'
    machine.save()

    # Record the detection as an InstalledBinary row too
    InstalledBinary.objects.create(
        machine=machine,
        name='chrome',
        abspath=chrome_override,
        version='143.0.7499.170',
        binprovider='env',
    )

    from archivebox.config.configset import get_config
    base_config = get_config()

    # Machine.config should override the base config value
    assert machine.config.get('CHROME_BINARY') == chrome_override

    # The version command must consult Machine.config first, then base config
    # (base config might say 'chromium' while Machine.config holds the full path)
    resolved = machine.config.get('CHROME_BINARY') or base_config.get('CHROME_BINARY', '')
    assert resolved == chrome_override, \
        "Machine.config override should take precedence over base config"
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
    """
    Test that SEARCH_BACKEND_ENGINE is passed to hook environment.

    Guards against regression where hooks couldn't determine which search backend was active.
    """
    # NOTE: removed an unused function-local `from pathlib import Path`
    # (Path is never used in this test and is already imported at module level)
    from archivebox.hooks import build_hook_environment
    from archivebox.config.configset import get_config

    config = get_config()
    search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')

    env = build_hook_environment(overrides=None)

    assert 'SEARCH_BACKEND_ENGINE' in env, \
        "SEARCH_BACKEND_ENGINE must be in hook environment"
    assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
        f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
@pytest.mark.django_db
def test_install_creates_installedbinary_records():
    """
    Test that archivebox install creates InstalledBinary records for detected binaries.

    This is an integration test that verifies the full install flow.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk

    machine = Machine.current()
    binaries_before = InstalledBinary.objects.filter(machine=machine).count()

    # Build an install-style crawl, mirroring what `archivebox install` does
    system_user_pk = get_or_create_system_user_pk()
    seed, _created = Seed.objects.get_or_create(
        uri='archivebox://test-install',
        label='Test dependency detection',
        created_by_id=system_user_pk,
        defaults={'extractor': 'auto'},
    )
    crawl = Crawl.objects.create(
        seed=seed,
        max_depth=0,
        created_by_id=system_user_pk,
        status='queued',
    )

    # Ticking the state machine from 'queued' -> started runs the on_Crawl hooks
    CrawlMachine(crawl).send('tick')

    # Hooks should have registered at least one new binary
    binaries_after = InstalledBinary.objects.filter(machine=machine).count()
    assert binaries_after > binaries_before, \
        "archivebox install should create InstalledBinary records"

    # At least one common binary should have been picked up
    common_binaries = ['git', 'wget', 'node']
    detected = [
        bin_name for bin_name in common_binaries
        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists()
    ]
    assert detected, f"At least one of {common_binaries} should be detected"

    # Any recorded path must be a full path, never a bare binary name
    # (version may legitimately be empty for some binaries)
    for binary in InstalledBinary.objects.filter(machine=machine):
        if binary.abspath:
            assert '/' in binary.abspath, \
                f"{binary.name} should have full path, not just name: {binary.abspath}"
@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
    """
    Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.

    Guards against ripgrep being installed/detected when not needed.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk
    # NOTE: removed unused `from django.conf import settings` import

    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    machine = Machine.current()
    # Clear any existing ripgrep records so detection state is unambiguous
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 1: With ripgrep backend - should be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
        created_by_id = get_or_create_system_user_pk()
        seed = Seed.objects.create(
            uri='archivebox://test-rg-enabled',
            label='Test ripgrep detection enabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl = Crawl.objects.create(
            seed=seed,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm = CrawlMachine(crawl)
        sm.send('tick')

        # Ripgrep should be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"

    # Clear records again before the negative case
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 2: With different backend - should NOT be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
        seed2 = Seed.objects.create(
            uri='archivebox://test-rg-disabled',
            label='Test ripgrep detection disabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl2 = Crawl.objects.create(
            seed=seed2,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm2 = CrawlMachine(crawl2)
        sm2.send('tick')

        # Ripgrep should NOT be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -29,7 +29,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index
INDEXABLE_FILES = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SEO metadata
async function extractSeo(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = 'singlefile';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
/**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
.filter(fn => fn.endsWith('.html'))
);
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
return null;
}
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command

View File

@@ -41,7 +41,7 @@ import rich_click as click
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'singlefile'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
return ''
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if extra_args:
cmd.extend(extra_args.split())
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
sys.exit(1)
version = get_version(binary)
cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
cmd_str = f'{binary} {url} {OUTPUT_FILE}'
# Run extraction
success, output, error = save_singlefile(url, binary)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SSL details
async function extractSsl(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs

View File

@@ -31,8 +31,8 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'
CHROME_SESSION_DIR = 'chrome_session'
OUTPUT_DIR = '.'
CHROME_SESSION_DIR = '../chrome_session'
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
if content_length and int(content_length) > max_size:
return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Determine filename
filename = get_filename_from_url(url)

View File

@@ -21,9 +21,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
}
async function extractTitle(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -43,7 +43,7 @@ import rich_click as click
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""

File diff suppressed because it is too large Load Diff

View File

@@ -57,13 +57,24 @@
box-shadow: 0 0 8px #3fb950;
animation: pulse 2s infinite;
}
#progress-monitor .status-dot.idle {
background: #d29922;
box-shadow: 0 0 4px #d29922;
}
#progress-monitor .status-dot.stopped {
background: #f85149;
background: #6e7681;
}
#progress-monitor .status-dot.flash {
animation: flash 0.3s ease-out;
}
@keyframes pulse {
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
}
@keyframes flash {
0% { transform: scale(1.5); }
100% { transform: scale(1); }
}
/* Stats */
#progress-monitor .stats {
@@ -89,6 +100,19 @@
#progress-monitor .stat-value.error { color: #f85149; }
#progress-monitor .stat-value.warning { color: #d29922; }
#progress-monitor .stat-value.info { color: #58a6ff; }
#progress-monitor .stat.clickable {
cursor: pointer;
padding: 2px 6px;
margin: -2px -6px;
border-radius: 4px;
transition: background 0.2s;
}
#progress-monitor .stat.clickable:hover {
background: rgba(255,255,255,0.1);
}
#progress-monitor .stat.clickable:active {
background: rgba(255,255,255,0.2);
}
/* Toggle Button */
#progress-monitor .toggle-btn {
@@ -259,48 +283,86 @@
padding: 0 12px 8px;
}
/* Extractor List */
/* Extractor List - Compact Badge Layout */
#progress-monitor .extractor-list {
padding: 8px 12px;
background: rgba(0,0,0,0.2);
border-top: 1px solid #21262d;
display: flex;
flex-wrap: wrap;
gap: 4px;
}
#progress-monitor .extractor-item {
#progress-monitor .extractor-badge {
position: relative;
display: inline-flex;
align-items: center;
gap: 4px;
padding: 3px 8px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 10px;
background: #21262d;
overflow: hidden;
white-space: nowrap;
}
#progress-monitor .extractor-badge .progress-fill {
position: absolute;
top: 0;
left: 0;
bottom: 0;
z-index: 0;
transition: width 0.3s ease-out;
}
#progress-monitor .extractor-badge .badge-content {
position: relative;
z-index: 1;
display: flex;
align-items: center;
gap: 8px;
padding: 4px 0;
gap: 4px;
}
#progress-monitor .extractor-icon {
font-size: 12px;
width: 16px;
text-align: center;
#progress-monitor .extractor-badge.queued {
color: #8b949e;
}
#progress-monitor .extractor-icon.running {
#progress-monitor .extractor-badge.queued .progress-fill {
background: rgba(110, 118, 129, 0.2);
width: 0%;
}
#progress-monitor .extractor-badge.started {
color: #d29922;
animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
#progress-monitor .extractor-badge.started .progress-fill {
background: rgba(210, 153, 34, 0.3);
width: 50%;
animation: progress-pulse 1.5s ease-in-out infinite;
}
@keyframes progress-pulse {
0%, 100% { opacity: 0.5; }
50% { opacity: 1; }
}
#progress-monitor .extractor-badge.succeeded {
color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
#progress-monitor .extractor-badge.succeeded .progress-fill {
background: rgba(63, 185, 80, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.failed {
color: #f85149;
}
#progress-monitor .extractor-icon.pending {
color: #8b949e;
#progress-monitor .extractor-badge.failed .progress-fill {
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
#progress-monitor .extractor-badge.started .badge-icon {
animation: spin 1s linear infinite;
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
flex: 1;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 11px;
}
#progress-monitor .extractor-progress {
width: 60px;
}
/* Status Badge */
#progress-monitor .status-badge {
@@ -356,11 +418,11 @@
<span class="stat-label">Queued</span>
<span class="stat-value warning" id="total-queued">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-succeeded" title="Click to reset counter">
<span class="stat-label">Done</span>
<span class="stat-value success" id="total-succeeded">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-failed" title="Click to reset counter">
<span class="stat-label">Failed</span>
<span class="stat-value error" id="total-failed">0</span>
</div>
@@ -390,6 +452,24 @@
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
let lastSucceeded = 0;
let lastFailed = 0;
// Click handlers for resetting counters
document.getElementById('stat-succeeded').addEventListener('click', function() {
succeededBaseline = lastSucceeded;
localStorage.setItem('progress-succeeded-baseline', succeededBaseline);
document.getElementById('total-succeeded').textContent = '0';
});
document.getElementById('stat-failed').addEventListener('click', function() {
failedBaseline = lastFailed;
localStorage.setItem('progress-failed-baseline', failedBaseline);
document.getElementById('total-failed').textContent = '0';
});
function formatUrl(url) {
try {
const u = new URL(url);
@@ -400,24 +480,18 @@
}
function renderExtractor(extractor) {
const iconClass = extractor.status === 'started' ? 'running' :
extractor.status === 'succeeded' ? 'success' :
extractor.status === 'failed' ? 'failed' : 'pending';
const icon = extractor.status === 'started' ? '&#8635;' :
extractor.status === 'succeeded' ? '&#10003;' :
extractor.status === 'failed' ? '&#10007;' : '&#9675;';
return `
<div class="extractor-item">
<span class="extractor-icon ${iconClass}">${icon}</span>
<span class="extractor-name">${extractor.extractor}</span>
<div class="extractor-progress">
<div class="progress-bar-container">
<div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
</div>
</div>
</div>
<span class="extractor-badge ${extractor.status}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.extractor}</span>
</span>
</span>
`;
}
@@ -427,10 +501,14 @@
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
let extractorHtml = '';
if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
// Sort extractors alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
a.extractor.localeCompare(b.extractor)
);
extractorHtml = `
<div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
${sortedExtractors.map(e => renderExtractor(e)).join('')}
</div>
`;
}
@@ -438,7 +516,7 @@
return `
<div class="snapshot-item" data-snapshot-key="${snapshotKey}">
<div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '&#9654;' : ''}</span>
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.all_extractors?.length ? '&#9654;' : ''}</span>
<span class="snapshot-icon">${statusIcon}</span>
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
@@ -469,6 +547,40 @@
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
}
// Show warning if crawl is stuck (queued but can't start)
let warningHtml = '';
if (crawl.status === 'queued' && !crawl.can_start) {
warningHtml = `
<div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
</div>
`;
} else if (crawl.status === 'queued' && crawl.retry_at_future) {
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
// Queued and waiting to be picked up by worker
warningHtml = `
<div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
</div>
`;
}
// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.seed_uri) {
metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
}
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
@@ -476,10 +588,11 @@
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
@@ -490,6 +603,7 @@
style="width: ${crawl.progress}%"></div>
</div>
</div>
${warningHtml}
<div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
<div class="snapshot-list">
${snapshotsHtml}
@@ -542,25 +656,48 @@
data.snapshots_pending > 0 || data.snapshots_started > 0 ||
data.archiveresults_pending > 0 || data.archiveresults_started > 0;
// Update orchestrator status
// Update orchestrator status - show "Running" only when there's actual activity
// Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently
const dot = document.getElementById('orchestrator-dot');
const text = document.getElementById('orchestrator-text');
if (data.orchestrator_running) {
dot.classList.remove('stopped');
const hasWorkers = data.total_workers > 0;
if (hasWorkers || hasActivity) {
dot.classList.remove('stopped', 'idle');
dot.classList.add('running');
text.textContent = 'Running';
} else {
dot.classList.remove('running');
dot.classList.add('stopped');
text.textContent = 'Stopped';
// No activity - show as idle (whether orchestrator process exists or not)
dot.classList.remove('stopped', 'running');
dot.classList.add('idle');
text.textContent = 'Idle';
}
// Pulse the dot to show we got fresh data
dot.classList.add('flash');
setTimeout(() => dot.classList.remove('flash'), 300);
// Update stats
document.getElementById('worker-count').textContent = data.total_workers;
document.getElementById('total-queued').textContent =
data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
document.getElementById('total-failed').textContent = data.archiveresults_failed;
// Store raw values and display relative to baseline
lastSucceeded = data.archiveresults_succeeded;
lastFailed = data.archiveresults_failed;
// If baseline is higher than current (e.g. after DB reset), reset baseline
if (succeededBaseline > lastSucceeded) {
succeededBaseline = 0;
localStorage.setItem('progress-succeeded-baseline', '0');
}
if (failedBaseline > lastFailed) {
failedBaseline = 0;
localStorage.setItem('progress-failed-baseline', '0');
}
document.getElementById('total-succeeded').textContent = lastSucceeded - succeededBaseline;
document.getElementById('total-failed').textContent = lastFailed - failedBaseline;
// Render crawl tree
if (data.active_crawls.length > 0) {

View File

@@ -7,9 +7,14 @@ class Command(BaseCommand):
help = 'Run the archivebox orchestrator'
def add_arguments(self, parser):
parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
parser.add_argument(
'--exit-on-idle',
action='store_true',
default=False,
help="Exit when all work is complete (default: run forever)"
)
def handle(self, *args, **kwargs):
daemon = kwargs.get('daemon', False)
orchestrator = Orchestrator(exit_on_idle=not daemon)
exit_on_idle = kwargs.get('exit_on_idle', False)
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
orchestrator.runloop()

View File

@@ -12,16 +12,17 @@ Architecture:
└── Each worker spawns task subprocesses via CLI
Usage:
# Embedded in other commands (exits when done)
# Default: runs forever (for use as subprocess of server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.runloop()
# Exit when done (for embedded use in other commands)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Daemon mode (runs forever)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start() # fork and return
# Or run via CLI
archivebox orchestrator [--daemon]
archivebox manage orchestrator # runs forever
archivebox manage orchestrator --exit-on-idle # exits when done
"""
__package__ = 'archivebox.workers'
@@ -45,6 +46,14 @@ from .pid_utils import (
)
def _run_orchestrator_process(exit_on_idle: bool) -> None:
"""Top-level function for multiprocessing (must be picklable)."""
from archivebox.config.django import setup_django
setup_django()
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
orchestrator.runloop()
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
@@ -277,12 +286,12 @@ class Orchestrator:
Fork orchestrator as a background process.
Returns the PID of the new process.
"""
def run_orchestrator():
from archivebox.config.django import setup_django
setup_django()
self.runloop()
proc = Process(target=run_orchestrator, name='orchestrator')
# Use module-level function to avoid pickle errors with local functions
proc = Process(
target=_run_orchestrator_process,
args=(self.exit_on_idle,),
name='orchestrator'
)
proc.start()
assert proc.pid is not None

View File

@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox manage orchestrator",
"command": "archivebox manage orchestrator", # runs forever by default
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",
@@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name):
def tail_worker_logs(log_path: str):
get_or_create_supervisord_process(daemonize=False)
from rich.live import Live
from rich.table import Table
table = Table()
table.add_column("TS")
table.add_column("URL")
try:
with Live(table, refresh_per_second=1) as live: # update 4 times a second to feel fluid
with open(log_path, 'r') as f:
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
except SystemExit:
pass
def tail_multiple_worker_logs(log_files: list[str], follow=True):
"""Tail multiple log files simultaneously, interleaving their output."""
import select
from pathlib import Path
# Convert relative paths to absolute paths
log_paths = []
for log_file in log_files:
log_path = Path(log_file)
if not log_path.is_absolute():
log_path = CONSTANTS.DATA_DIR / log_path
# Create log file if it doesn't exist
if not log_path.exists():
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.touch()
log_paths.append(log_path)
# Open all log files
file_handles = []
for log_path in log_paths:
try:
f = open(log_path, 'r')
# Seek to end of file if following
if follow:
f.seek(0, 2) # Seek to end
file_handles.append((log_path.name, f))
except Exception as e:
print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")
if not file_handles:
print("[red]No log files could be opened[/red]")
return
# Print which logs we're tailing
log_names = [name for name, _ in file_handles]
print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
print()
try:
while follow:
# Read available lines from all files
for log_name, f in file_handles:
line = f.readline()
if line:
# Colorize based on log source
if 'orchestrator' in log_name.lower():
color = 'cyan'
elif 'daphne' in log_name.lower():
color = 'green'
else:
color = 'white'
# Strip ANSI codes if present (supervisord does this but just in case)
import re
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
if line_clean:
print(f'[{color}][{log_name}][/{color}] {line_clean}')
# Small sleep to avoid busy-waiting
time.sleep(0.1)
except (KeyboardInterrupt, BrokenPipeError, IOError):
print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
except SystemExit:
pass
finally:
# Close all file handles
for _, f in file_handles:
try:
f.close()
except Exception:
pass
def watch_worker(supervisor, daemon_name, interval=5):
"""loop continuously and monitor worker's health"""
while True:

View File

@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
from django.utils import timezone
def ensure_orchestrator_running():
"""Ensure the orchestrator is running to process queued items."""
from .orchestrator import Orchestrator
if not Orchestrator.is_running():
# Start orchestrator in background
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start()
def bg_add(add_kwargs: dict) -> int:
"""
Add URLs and queue them for archiving.
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:
result = add(**add_kwargs)
# Ensure orchestrator is running to process the new snapshots
ensure_orchestrator_running()
return len(result) if result else 0
@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
)
queued_count += 1
# Ensure orchestrator is running to process the queued snapshots
if queued_count > 0:
ensure_orchestrator_running()
return queued_count
@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
# Ensure orchestrator is running to process the queued snapshot
ensure_orchestrator_running()
return 1
return 0

View File

@@ -67,8 +67,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.5
IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit)
POLL_INTERVAL: ClassVar[float] = 1.0
IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
self.worker_id = worker_id