Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2026-04-03 14:27:55 +10:00
Logging and admin UI improvements
archivebox/ArchiveBox.conf (new file, 3 lines added)
@@ -0,0 +1,3 @@
[SERVER_CONFIG]
SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4
@@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
fields = ('created_by', 'token', 'expires', *readonly_fields)

fieldsets = (
('Token', {
'fields': ('token', 'expires'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)

list_filter = ('created_by',)
ordering = ['-created_at']
@@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)

fieldsets = (
('Webhook', {
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
'classes': ('card', 'wide'),
}),
('Authentication', {
'fields': ('auth_token',),
'classes': ('card',),
}),
('Status', {
'fields': ('enabled', 'last_success', 'last_error'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)


def register_admin(admin_site):
admin_site.register(APIToken, APITokenAdmin)
@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached

if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run orchestrator until all work is done
# Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete
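Note: the two branches above reduce to a queue-and-return path and a blocking path. A minimal, illustrative usage sketch of the add() function patched here (the URL is a placeholder, not part of the diff):

# bg=True queues the URLs and returns immediately; a separately running `archivebox server`
# orchestrator picks them up later.
add(['https://example.com'], bg=True)

# bg=False runs the orchestrator inline and blocks until the crawl completes.
add(['https://example.com'], bg=False)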
@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL
try:
snap = Snapshot.objects.get(url=record['url'])
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
except Snapshot.DoesNotExist:
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)

elif record_type == TYPE_ARCHIVERESULT:
@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()

seed = Seed.objects.create(
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)

crawl = Crawl.objects.create(
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
defaults={
'status': 'queued',
}
)

# If crawl already existed, reset it to queued state so it can be processed again
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.save()

print(f'[+] Created dependency detection crawl: {crawl.id}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')

# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=CrawlModel.FINAL_STATES
)
print(f'[+] Crawls in queue: {queued_crawls.count()}')
if queued_crawls.exists():
for c in queued_crawls:
print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')

print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()
@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass

print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')

if SHELL_CONFIG.DEBUG:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
from workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
)

# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')

# If daphne is already running, just tail logs
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()

# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
# Otherwise, daphne is not running - fall through to start it

# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue

# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
is_path = '/' in str(bin_value)

if is_path:
# It's a full path - match against abspath
@@ -5,7 +5,6 @@ import sys

from datetime import datetime, timezone

from rich.progress import Progress
from rich.console import Console

import django
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True)
logging.CONSOLE = CONSOLE


INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0

def bump_startup_progress_bar(advance=1):
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore


def setup_django_minimal():
# sys.path.append(str(CONSTANTS.PACKAGE_DIR))
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
@@ -49,9 +38,7 @@ DJANGO_SET_UP = False

def setup_django(check_db=False, in_memory_db=False) -> None:
from rich.panel import Panel

global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK

global DJANGO_SET_UP

if DJANGO_SET_UP:
@@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return

with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)

from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission

# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission

bump_startup_progress_bar()
try:
from django.core.management import call_command

bump_startup_progress_bar()
# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')

if in_memory_db:
raise Exception('dont use this anymore')

# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
try:
from django.core.management import call_command

if in_memory_db:
raise Exception('dont use this anymore')

# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()

call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()

bump_startup_progress_bar()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()
except Exception as e:
bump_startup_progress_bar(advance=1000)

is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
STDERR.print_exception(show_locals=False)
return

bump_startup_progress_bar()

from django.conf import settings

# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
except Exception as e:
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
raise SystemExit(9)

# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
STDERR.print_exception(show_locals=False)
return

bump_startup_progress_bar()
from django.conf import settings

# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print()
raise SystemExit(9)

bump_startup_progress_bar()
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)

# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()

# import logfire
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')

# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()

# import logfire

# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)

except KeyboardInterrupt:
raise SystemExit(2)

except KeyboardInterrupt:
raise SystemExit(2)

DJANGO_SET_UP = True

INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = None
@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot


def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, extractor, output, and actions."""

results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])

if not results:
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

# Status colors
status_colors = {
'succeeded': ('#166534', '#dcfce7'), # green
'failed': ('#991b1b', '#fee2e2'), # red
'queued': ('#6b7280', '#f3f4f6'), # gray
'started': ('#92400e', '#fef3c7'), # amber
}

rows = []
for idx, result in enumerate(results):
status = result.status or 'queued'
color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))

# Get extractor icon
icon = get_extractor_icon(result.extractor)

# Format timestamp
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

# Truncate output for display
full_output = result.output or '-'
output_display = full_output[:60]
if len(full_output) > 60:
output_display += '...'

# Get full command as tooltip
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')

# Build output link
output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'

# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'

# Unique ID for this row's expandable output
row_id = f'output_{idx}_{str(result.id)[:8]}'

rows.append(f'''
<tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
<td style="padding: 10px 12px; white-space: nowrap;">
<span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
font-size: 11px; font-weight: 600; text-transform: uppercase;
color: {color}; background: {bg};">{status}</span>
</td>
<td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{result.extractor}">
{icon}
</td>
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
{result.extractor}
</td>
<td style="padding: 10px 12px; max-width: 280px;">
<span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
title="Click to expand full output">
{output_display}
</span>
</td>
<td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
{end_time}
</td>
<td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
{version}
</td>
<td style="padding: 10px 8px; white-space: nowrap;">
<div style="display: flex; gap: 4px;">
<a href="{output_link}" target="_blank"
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
title="View output">📄</a>
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
title="Edit">✏️</a>
</div>
</td>
</tr>
<tr style="border-bottom: 1px solid #e2e8f0;">
<td colspan="7" style="padding: 0 12px 10px 12px;">
<details id="{row_id}" style="margin: 0;">
<summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
Details & Output
</summary>
<div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
<span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
<span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
<span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or '-'}</code></span>
</div>
<div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
<b>Output:</b>
</div>
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre>
<div style="font-size: 11px; color: #64748b; margin-top: 8px;">
<b>Command:</b>
</div>
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
</div>
</details>
</td>
</tr>
''')

total_count = archiveresults_qs.count()
footer = ''
if total_count > limit:
footer = f'''
<tr>
<td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
Showing {limit} of {total_count} results
<a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
style="color: #2563eb;">View all →</a>
</td>
</tr>
'''

return mark_safe(f'''
<div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
<thead>
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
<th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
{footer}
</tbody>
</table>
</div>
''')


class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']

fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)

list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE


paginator = AccelleratedPaginator
save_on_top = True


actions = ['delete_selected']


class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add

from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list


# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)

fieldsets = (
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
}),
('Status', {
'fields': ('status', 'retry_at', 'status_info'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'classes': ('card',),
}),
('Config', {
'fields': ('config',),
'classes': ('card',),
}),
('Files', {
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
}),
)

ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)

action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)

@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
return render_archiveresults_list(obj.archiveresult_set.all())

@admin.display(
description='Title',
ordering='title',
@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]

fieldsets = (
('Tag Info', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)

paginator = AccelleratedPaginator
@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'

import sys

from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()

# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()

def _maybe_start_orchestrator(self):
"""Start the orchestrator if we're running a web server."""
import os

# Don't start orchestrator during migrations, shell, tests, etc.
# Only start when running: runserver, daphne, gunicorn, uwsgi
if not self._is_web_server():
return

# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
return

# Don't start in autoreload child process (avoid double-start)
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
return

try:
from workers.orchestrator import Orchestrator

if not Orchestrator.is_running():
# Start orchestrator as daemon (won't exit on idle when started by server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start()
except Exception as e:
# Don't crash the server if orchestrator fails to start
import logging
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')

def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
@@ -0,0 +1,22 @@
from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('core', '0024_snapshot_crawl'),
]

operations = [
# Remove the unique constraint on url
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True, unique=False),
),
# Add unique constraint on (url, crawl) combination
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
),
]
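For reference, a sketch of applying this new migration with Django's call_command, the same mechanism setup_django uses elsewhere in this changeset (illustrative only; the 'core' app label matches the dependency above):

from django.core.management import call_command
# drops the unique index on Snapshot.url and adds the unique_url_per_crawl constraint
call_command("migrate", "core", interactive=False, verbosity=1)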
@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name

def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)

if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)

@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))

try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)

url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]

def __str__(self):
return f'[{self.id}] {self.url[:64]}'

def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
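A short illustrative sketch of what the new unique_url_per_crawl constraint permits (crawl_a and crawl_b stand in for two existing Crawl rows; not part of the diff):

from django.db import IntegrityError

Snapshot.objects.create(url='https://example.com', crawl=crawl_a)
Snapshot.objects.create(url='https://example.com', crawl=crawl_b)  # allowed: same URL, different crawl
try:
    Snapshot.objects.create(url='https://example.com', crawl=crawl_a)  # duplicate within the same crawl
except IntegrityError:
    pass  # rejected by unique_url_per_crawl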
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()

if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)

def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'

def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created ArchiveResult',
indent_level=3,
extractor=self.extractor,
metadata={
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
'snapshot_url': str(self.snapshot.url)[:64],
'status': self.status,
},
)

@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook

extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return

# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name

# Run the hook
start_ts = timezone.now()
result = run_hook(
@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)

def __repr__(self) -> str:
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Snapshot[{self.snapshot.id}]'

def __str__(self) -> str:
return self.__repr__()

def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
# Suppressed: queue waiting logs
return can_start

def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):

@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)


@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.run()')
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)


@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)

def __repr__(self) -> str:
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'ArchiveResult[{self.archiveresult.id}]'

def __str__(self) -> str:
return self.__repr__()

def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
# Suppressed: queue waiting logs
return can_start

def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):

@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes


@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')

# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)


# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()


# Save the updated result
self.archiveresult.save()

# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')

# Suppressed: extractor result logs (already logged by worker)

@backoff.enter
def enter_backoff(self):
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)


@succeeded.enter
def enter_succeeded(self):
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):

@failed.enter
def enter_failed(self):
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):

@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)

# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')

# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField

# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()

# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed

# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0

# Get active extractors for this snapshot
active_extractors = [
# Get all extractors for this snapshot
# Order: started first, then queued, then completed
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
]

active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
'all_extractors': all_extractors,
})

# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None

# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0

active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})

return JsonResponse({
@@ -8,6 +8,7 @@ from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.db.models import Count, Q

from archivebox import DATA_DIR
@@ -19,13 +20,155 @@ from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule


def render_snapshots_list(snapshots_qs, limit=20):
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""

snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
total_results=Count('archiveresult'),
succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
)

if not snapshots:
return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')

# Status colors matching Django admin and progress monitor
status_colors = {
'queued': ('#6c757d', '#f8f9fa'), # gray
'started': ('#856404', '#fff3cd'), # amber
'sealed': ('#155724', '#d4edda'), # green
'failed': ('#721c24', '#f8d7da'), # red
}

rows = []
for snapshot in snapshots:
status = snapshot.status or 'queued'
color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))

# Calculate progress
total = snapshot.total_results
done = snapshot.succeeded_results + snapshot.failed_results
progress_pct = int((done / total) * 100) if total > 0 else 0
progress_text = f'{done}/{total}' if total > 0 else '-'

# Truncate title and URL
title = (snapshot.title or 'Untitled')[:60]
if len(snapshot.title or '') > 60:
title += '...'
url_display = snapshot.url[:50]
if len(snapshot.url) > 50:
url_display += '...'

# Format date
date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'

rows.append(f'''
<tr style="border-bottom: 1px solid #eee;">
<td style="padding: 6px 8px; white-space: nowrap;">
<span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
font-size: 11px; font-weight: 500; text-transform: uppercase;
color: {color}; background: {bg};">{status}</span>
</td>
<td style="padding: 6px 8px; white-space: nowrap;">
<a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
<img src="/archive/{snapshot.timestamp}/favicon.ico"
style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
onerror="this.style.display='none'"/>
</a>
</td>
<td style="padding: 6px 8px; max-width: 300px;">
<a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
title="{snapshot.title or 'Untitled'}">{title}</a>
</td>
<td style="padding: 6px 8px; max-width: 250px;">
<a href="{snapshot.url}" target="_blank"
style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
title="{snapshot.url}">{url_display}</a>
</td>
<td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
<div style="display: inline-flex; align-items: center; gap: 6px;">
<div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
<div style="width: {progress_pct}%; height: 100%;
background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
transition: width 0.3s;"></div>
</div>
<a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
title="View archive results">{progress_text}</a>
</div>
</td>
<td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
{date_str}
</td>
</tr>
''')

total_count = snapshots_qs.count()
footer = ''
if total_count > limit:
footer = f'''
<tr>
<td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
Showing {limit} of {total_count} snapshots
</td>
</tr>
'''

return mark_safe(f'''
<div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
<thead>
<tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
<th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
{footer}
</tbody>
</table>
</div>
''')


class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')

readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)

fieldsets = (
('Source', {
'fields': ('uri', 'contents'),
'classes': ('card', 'wide',
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes', 'tags_str'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': ('extractor', 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Crawls', {
|
||||
'fields': ('scheduled_crawls', 'crawls'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('extractor', 'created_by')
|
||||
ordering = ['-created_at']
|
||||
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||
|
||||
def snapshots(self, obj):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(obj.snapshot_set.all())
|
||||
|
||||
def contents(self, obj):
|
||||
if obj.uri.startswith('file:///data/'):
|
||||
source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
|
||||
source_file = obj.get_file_path()
|
||||
if source_file:
|
||||
contents = ""
|
||||
try:
|
||||
contents = source_file.read_text().strip()[:14_000]
|
||||
except Exception as e:
|
||||
contents = f'Error reading {source_file}: {e}'
|
||||
|
||||
|
||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
||||
|
||||
|
||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
|
||||
|
||||
|
||||
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
||||
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
|
||||
|
||||
fieldsets = (
|
||||
('URLs', {
|
||||
'fields': ('seed_urls_editor',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': ('max_depth', 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('status', 'retry_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('seed', 'schedule', 'created_by'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
||||
ordering = ['-created_at', '-retry_at']
|
||||
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
def recrawl(self, request, obj):
|
||||
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
||||
from django.utils import timezone
|
||||
from django.shortcuts import redirect
|
||||
|
||||
# Validate seed has a URI (required for crawl to start)
|
||||
if not obj.seed:
|
||||
messages.error(request, 'Cannot recrawl: original crawl has no seed.')
|
||||
return redirect('admin:crawls_crawl_change', obj.id)
|
||||
|
||||
if not obj.seed.uri:
|
||||
messages.error(request, 'Cannot recrawl: seed has no URI.')
|
||||
return redirect('admin:crawls_crawl_change', obj.id)
|
||||
|
||||
new_crawl = Crawl.objects.create(
|
||||
seed=obj.seed,
|
||||
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
f'It will start processing shortly.'
|
||||
)
|
||||
|
||||
# Redirect to the new crawl's change page
|
||||
from django.shortcuts import redirect
|
||||
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||
|
||||
def get_urls(self):
|
||||
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
except Crawl.DoesNotExist:
|
||||
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
||||
|
||||
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
|
||||
source_file = crawl.seed.get_file_path() if crawl.seed else None
|
||||
if not source_file:
|
||||
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
||||
|
||||
try:
|
||||
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
except json.JSONDecodeError:
|
||||
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
||||
|
||||
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
|
||||
|
||||
try:
|
||||
# Ensure parent directory exists
|
||||
source_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
return obj.snapshot_set.count()
|
||||
|
||||
def snapshots(self, obj):
|
||||
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(obj.snapshot_set.all())
|
||||
|
||||
@admin.display(description='Schedule', ordering='schedule')
|
||||
def schedule_str(self, obj):
|
||||
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
seed_uri = obj.urls
|
||||
|
||||
# Check if it's a local file we can edit
|
||||
is_file = seed_uri.startswith('file:///data/')
|
||||
source_file = obj.seed.get_file_path() if obj.seed else None
|
||||
is_file = source_file is not None
|
||||
contents = ""
|
||||
error = None
|
||||
source_file = None
|
||||
|
||||
if is_file:
|
||||
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
|
||||
if is_file and source_file:
|
||||
try:
|
||||
contents = source_file.read_text().strip()
|
||||
except Exception as e:
|
||||
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
|
||||
fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Schedule Info', {
|
||||
'fields': ('label', 'notes'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Configuration', {
|
||||
'fields': ('schedule', 'template'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Crawls', {
|
||||
'fields': ('crawls',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('created_by',)
|
||||
ordering = ['-created_at']
|
||||
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
||||
|
||||
def snapshots(self, obj):
|
||||
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.uri[:64]}'
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
super().save(*args, **kwargs)
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Seed',
|
||||
indent_level=0,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'uri': str(self.uri)[:64],
|
||||
'extractor': self.extractor,
|
||||
'label': self.label or None,
|
||||
},
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
|
||||
source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
|
||||
# Use absolute path for file:// URLs so extractors can find the files
|
||||
source_path = str(source_file.resolve())
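# e.g. (illustrative, hypothetical path) a source file at /home/user/data/sources/bookmarks.html
# now yields uri='file:///home/user/data/sources/bookmarks.html' instead of 'file:///data/sources/bookmarks.html'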
|
||||
seed, _ = cls.objects.get_or_create(
|
||||
label=label or source_file.name, uri=f'file://{source_path}',
|
||||
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
|
||||
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_seed', args=[self.id])
|
||||
|
||||
def get_file_path(self) -> Path | None:
|
||||
"""
|
||||
Get the filesystem path for file:// URIs.
|
||||
Handles both old format (file:///data/...) and new format (file:///absolute/path).
|
||||
Returns None if URI is not a file:// URI.
|
||||
"""
|
||||
if not self.uri.startswith('file://'):
|
||||
return None
|
||||
|
||||
# Remove file:// prefix
|
||||
path_str = self.uri.replace('file://', '', 1)
|
||||
|
||||
# Handle old format: file:///data/... -> DATA_DIR/...
|
||||
if path_str.startswith('/data/'):
|
||||
return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
|
||||
|
||||
# Handle new format: file:///absolute/path
|
||||
return Path(path_str)
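# Illustrative examples (paths are hypothetical):
#   'file:///data/sources/bookmarks.html' -> CONSTANTS.DATA_DIR / 'sources/bookmarks.html'
#   'file:///home/user/bookmarks.html'    -> Path('/home/user/bookmarks.html')
#   'https://example.com/feed.xml'        -> None (not a file:// URI)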
|
||||
|
||||
@property
|
||||
def snapshot_set(self) -> QuerySet['Snapshot']:
|
||||
from core.models import Snapshot
|
||||
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
super().save(*args, **kwargs)
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Crawl',
|
||||
indent_level=1,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
|
||||
'max_depth': self.max_depth,
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
|
||||
crawl, _ = cls.objects.get_or_create(
|
||||
|
||||
@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||
super().__init__(crawl, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]Crawl\\[{self.crawl.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
return f'Crawl[{self.crawl.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
return bool(self.crawl.seed and self.crawl.seed.uri)
|
||||
if not self.crawl.seed:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
|
||||
return False
|
||||
if not self.crawl.seed.uri:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
|
||||
# Suppressed: state transition logs
|
||||
# lock the crawl object while we create snapshots
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
# Run the crawl - creates root snapshot and processes queued URLs
|
||||
self.crawl.run()
|
||||
try:
|
||||
# Run on_Crawl hooks to validate/install dependencies
|
||||
self._run_crawl_hooks()
|
||||
|
||||
# only update status to STARTED once snapshots are created
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
# Run the crawl - creates root snapshot and processes queued URLs
|
||||
self.crawl.run()
|
||||
|
||||
# only update status to STARTED once snapshots are created
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Re-raise so the worker knows it failed
|
||||
raise
|
||||
|
||||
def _run_crawl_hooks(self):
|
||||
"""Run on_Crawl hooks to validate/install dependencies."""
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hooks, discover_hooks
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
# Discover and run all on_Crawl hooks
|
||||
hooks = discover_hooks('Crawl')
|
||||
if not hooks:
|
||||
return
|
||||
|
||||
# Create a temporary output directory for hook results
|
||||
output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run all on_Crawl hooks
|
||||
results = run_hooks(
|
||||
event_name='Crawl',
|
||||
output_dir=output_dir,
|
||||
timeout=60,
|
||||
config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
|
||||
crawl_id=str(self.crawl.id),
|
||||
seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
|
||||
)
|
||||
|
||||
@sealed.enter
|
||||
# Process hook results - parse JSONL output and create DB objects
|
||||
self._process_hook_results(results)
|
||||
|
||||
def _process_hook_results(self, results: list):
|
||||
"""Process JSONL output from hooks to create InstalledBinary and update Machine config."""
|
||||
import json
|
||||
from machine.models import Machine, InstalledBinary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
for result in results:
|
||||
if result['returncode'] != 0:
|
||||
# Hook failed - might indicate missing dependency
|
||||
continue
|
||||
|
||||
# Parse JSONL output
|
||||
for line in result['stdout'].strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
obj_type = obj.get('type')
|
||||
|
||||
if obj_type == 'InstalledBinary':
|
||||
# Create or update InstalledBinary record
|
||||
# Skip if essential fields are missing
|
||||
if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
|
||||
continue
|
||||
|
||||
InstalledBinary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=obj['name'],
|
||||
defaults={
|
||||
'abspath': obj['abspath'],
|
||||
'version': obj['version'],
|
||||
'sha256': obj.get('sha256') or '',
|
||||
'binprovider': obj.get('binprovider') or 'env',
|
||||
}
|
||||
)
|
||||
|
||||
elif obj_type == 'Machine':
|
||||
# Update Machine config
|
||||
method = obj.get('_method', 'update')
|
||||
if method == 'update':
|
||||
key = obj.get('key', '')
|
||||
value = obj.get('value')
|
||||
if key.startswith('config/'):
|
||||
config_key = key[7:] # Remove 'config/' prefix
|
||||
machine.config[config_key] = value
|
||||
machine.save(update_fields=['config'])
|
||||
|
||||
elif obj_type == 'Dependency':
|
||||
# Dependency request - could trigger installation
|
||||
# For now just log it (installation hooks would be separate)
|
||||
print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')
|
||||
|
||||
except json.JSONDecodeError:
|
||||
# Not JSON, skip
|
||||
continue
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
|
||||
# Suppressed: state transition logs
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
|
||||
@@ -245,6 +245,14 @@ def run_hook(
|
||||
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
|
||||
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
|
||||
|
||||
# Pass SEARCH_BACKEND_ENGINE from new-style config
|
||||
try:
|
||||
from archivebox.config.configset import get_config
|
||||
search_config = get_config()
|
||||
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
|
||||
except Exception:
|
||||
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
|
||||
|
||||
# Create output directory if needed
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
2
archivebox/logs/errors.log
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False
|
||||
@@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
||||
|
||||
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
||||
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
|
||||
|
||||
fieldsets = (
|
||||
('Identity', {
|
||||
'fields': ('hostname', 'guid', 'ips'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Hardware', {
|
||||
'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Operating System', {
|
||||
'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Statistics', {
|
||||
'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Configuration', {
|
||||
'fields': ('config',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
||||
ordering = ['-created_at']
|
||||
@@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
||||
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
|
||||
|
||||
readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
|
||||
fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
|
||||
|
||||
fieldsets = (
|
||||
('Machine', {
|
||||
'fields': ('machine',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Network', {
|
||||
'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Location', {
|
||||
'fields': ('hostname', 'isp', 'city', 'region', 'country'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Usage', {
|
||||
'fields': ('num_uses_succeeded', 'num_uses_failed'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('isp', 'country', 'region')
|
||||
ordering = ['-created_at']
|
||||
@@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
search_fields = ('id', 'bin_name', 'bin_providers')
|
||||
|
||||
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
|
||||
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Binary', {
|
||||
'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Commands', {
|
||||
'fields': ('custom_cmds',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Configuration', {
|
||||
'fields': ('config',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('id', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('bin_providers', 'created_at')
|
||||
ordering = ['-created_at']
|
||||
@@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
||||
|
||||
fieldsets = (
|
||||
('Binary Info', {
|
||||
'fields': ('name', 'dependency', 'binprovider'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Location', {
|
||||
'fields': ('machine', 'abspath'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Version', {
|
||||
'fields': ('version', 'sha256'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Usage', {
|
||||
'fields': ('num_uses_succeeded', 'num_uses_failed'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
|
||||
ordering = ['-created_at']
|
||||
|
||||
@@ -544,16 +544,21 @@ def log_worker_event(
|
||||
|
||||
# Build worker identifier
|
||||
worker_parts = [worker_type]
|
||||
if pid:
|
||||
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
|
||||
if pid and worker_type != 'DB':
|
||||
worker_parts.append(f'pid={pid}')
|
||||
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
|
||||
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
|
||||
worker_parts.append(f'id={worker_id}')
|
||||
if url and worker_type == 'SnapshotWorker':
|
||||
if url and worker_type in ('SnapshotWorker', 'DB'):
|
||||
worker_parts.append(f'url={truncate_url(url)}')
|
||||
if extractor and worker_type == 'ArchiveResultWorker':
|
||||
if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
|
||||
worker_parts.append(f'extractor={extractor}')
|
||||
|
||||
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
|
||||
# Format worker label - only add brackets if there are additional identifiers
|
||||
if len(worker_parts) > 1:
|
||||
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
|
||||
else:
|
||||
worker_label = worker_parts[0]
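# e.g. worker_label is just 'DB' when there are no extra identifiers, or something like
# 'ArchiveResultWorker[pid=1234, extractor=screenshot]' when there are (values illustrative)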
|
||||
|
||||
# Build metadata string
|
||||
metadata_str = ''
|
||||
@@ -579,12 +584,14 @@ def log_worker_event(
|
||||
meta_parts.append(f'{k}: {len(v)}')
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v}')
|
||||
metadata_str = ' {' + ', '.join(meta_parts) + '}'
|
||||
metadata_str = ' | '.join(meta_parts)
|
||||
|
||||
# Determine color based on event
|
||||
color = 'white'
|
||||
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
|
||||
color = 'green'
|
||||
elif event.startswith('Created'):
|
||||
color = 'cyan' # DB creation events
|
||||
elif event in ('Processing...', 'PROCESSING'):
|
||||
color = 'blue'
|
||||
elif event in ('Completed', 'COMPLETED', 'All work complete'):
|
||||
@@ -606,8 +613,9 @@ def log_worker_event(
|
||||
text.append(indent) # Indentation
|
||||
# Append worker label and event with color
|
||||
text.append(f'{worker_label} {event}{error_str}', style=color)
|
||||
# Append metadata without color
|
||||
text.append(metadata_str)
|
||||
# Append metadata without color (add separator if metadata exists)
|
||||
if metadata_str:
|
||||
text.append(f' | {metadata_str}')
|
||||
|
||||
CONSOLE.print(text)
|
||||
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'accessibility';
|
||||
const OUTPUT_DIR = 'accessibility';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'accessibility.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract accessibility info
|
||||
async function extractAccessibility(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
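// Assumed layout (illustrative): the hook is launched with cwd set to its own output dir,
// so accessibility.json is written in-place and the shared Chrome session lives at ../chrome_session/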
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -24,7 +24,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'archive_org'
|
||||
OUTPUT_DIR = 'archive_org'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'chrome_cleanup'
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
CHROME_SESSION_DIR = '../chrome_session'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
|
||||
@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'chrome_navigate';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
|
||||
async function captureConsoleLogs(url) {
|
||||
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Clear existing file
|
||||
|
||||
@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'dom';
|
||||
const OUTPUT_DIR = 'dom';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.html';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -114,10 +114,7 @@ async function dumpDom(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'favicon'
|
||||
OUTPUT_DIR = 'favicon'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'favicon.ico'
|
||||
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'repo'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
|
||||
@@ -22,9 +22,9 @@ const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'headers';
|
||||
const OUTPUT_DIR = 'headers';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
const CHROME_HEADERS_FILE = 'response_headers.json';
|
||||
|
||||
// Parse command line arguments
|
||||
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
|
||||
}
|
||||
|
||||
async function extractHeaders(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Try Chrome session first
|
||||
|
||||
@@ -28,7 +28,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'htmltotext'
|
||||
OUTPUT_DIR = 'htmltotext'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'htmltotext.txt'
|
||||
|
||||
|
||||
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
|
||||
if not text or len(text) < 10:
|
||||
return False, None, 'No meaningful text extracted from HTML'
|
||||
|
||||
# Create output directory and write output
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / OUTPUT_FILE
|
||||
output_path.write_text(text, encoding='utf-8')
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'media'
|
||||
BIN_NAME = 'yt-dlp'
|
||||
BIN_PROVIDERS = 'pip,apt,brew,env'
|
||||
OUTPUT_DIR = 'media'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
|
||||
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Build command (later options take precedence)
|
||||
cmd = [
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
BIN_NAME = 'postlight-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'mercury'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
timeout = get_env_int('TIMEOUT', 60)
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Get text version
|
||||
|
||||
@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'outlinks.json';
|
||||
const URLS_FILE = 'urls.jsonl'; // For crawl system
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -64,10 +64,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract outlinks
|
||||
async function extractOutlinks(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'pdf';
|
||||
const OUTPUT_DIR = 'pdf';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.pdf';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -113,10 +113,7 @@ async function printToPdf(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -29,7 +29,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'readability'
|
||||
BIN_NAME = 'readability-extractor'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'readability'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if not html_source:
|
||||
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Run readability-extractor (outputs JSON by default)
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'redirects';
|
||||
const OUTPUT_DIR = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Track redirect chain
|
||||
async function trackRedirects(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'responses';
|
||||
const OUTPUT_DIR = 'responses';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const OUTPUT_DIR = '.';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
|
||||
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
|
||||
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
|
||||
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
|
||||
|
||||
// Create output directories
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
// Create subdirectories for organizing responses
|
||||
const allDir = path.join(OUTPUT_DIR, 'all');
|
||||
if (!fs.existsSync(allDir)) {
|
||||
fs.mkdirSync(allDir, { recursive: true });
|
||||
|
||||
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'screenshot';
|
||||
const OUTPUT_DIR = 'screenshot';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'screenshot.png';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
131
archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for ripgrep binary.
|
||||
|
||||
Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from ripgrep binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# ripgrep version string: "ripgrep 14.1.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'ripgrep' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
# Try to find version number pattern
|
||||
for part in parts:
|
||||
if part[0].isdigit() and '.' in part:
|
||||
return part
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ripgrep() -> dict | None:
|
||||
"""Find ripgrep binary using shutil.which or env var."""
|
||||
# Check env var first - if it's an absolute path and exists, use it
|
||||
ripgrep_env = os.environ.get('RIPGREP_BINARY', '')
|
||||
if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
|
||||
abspath = ripgrep_env
|
||||
else:
|
||||
# Otherwise try shutil.which with the env var as the binary name
|
||||
abspath = shutil.which(ripgrep_env) if ripgrep_env else None
|
||||
if not abspath:
|
||||
abspath = shutil.which('rg')
|
||||
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'rg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
"""Validate ripgrep binary and output JSONL."""
|
||||
|
||||
# Check if ripgrep search backend is enabled
|
||||
search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
|
||||
|
||||
if search_backend != 'ripgrep':
|
||||
# No-op: ripgrep is not the active search backend
|
||||
sys.exit(0)
|
||||
|
||||
result = find_ripgrep()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
# Output InstalledBinary
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': result['name'],
|
||||
'abspath': result['abspath'],
|
||||
'version': result['version'],
|
||||
'sha256': result['sha256'],
|
||||
'binprovider': result['binprovider'],
|
||||
}))
|
||||
|
||||
# Output Machine config update
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/RIPGREP_BINARY',
|
||||
'value': result['abspath'],
|
||||
}))
|
||||
|
||||
if result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/RIPGREP_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Output Dependency request
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'rg',
|
||||
'bin_providers': 'apt,brew,cargo,env',
|
||||
}))
|
||||
|
||||
# Exit non-zero to indicate binary not found
|
||||
print(f"ripgrep binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for ripgrep binary detection and archivebox install functionality.
|
||||
|
||||
Guards against regressions in:
|
||||
1. Machine.config overrides not being used in version command
|
||||
2. Ripgrep hook not resolving binary names via shutil.which()
|
||||
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_ripgrep_hook_detects_binary_from_path():
|
||||
"""Test that ripgrep hook finds binary using shutil.which() when env var is just a name."""
|
||||
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
|
||||
|
||||
# Skip if rg is not installed
|
||||
if not shutil.which('rg'):
|
||||
pytest.skip("ripgrep (rg) not installed")
|
||||
|
||||
# Set SEARCH_BACKEND_ENGINE to enable the hook
|
||||
env = os.environ.copy()
|
||||
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
|
||||
env['RIPGREP_BINARY'] = 'rg' # Just the name, not the full path (this was the bug)
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(hook_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Hook failed: {result.stderr}"
|
||||
|
||||
# Parse JSONL output
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"
|
||||
|
||||
installed_binary = json.loads(lines[0])
|
||||
assert installed_binary['type'] == 'InstalledBinary'
|
||||
assert installed_binary['name'] == 'rg'
|
||||
assert '/' in installed_binary['abspath'], "Expected full path, not just binary name"
|
||||
assert Path(installed_binary['abspath']).is_file(), "Binary path should exist"
|
||||
assert installed_binary['version'], "Version should be detected"
|
||||
|
||||
machine_config = json.loads(lines[1])
|
||||
assert machine_config['type'] == 'Machine'
|
||||
assert machine_config['key'] == 'config/RIPGREP_BINARY'
|
||||
assert '/' in machine_config['value'], "Machine config should store full path"
|
||||
|
||||
|
||||
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
|
||||
"""Test that ripgrep hook exits silently when search backend is not ripgrep."""
|
||||
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
|
||||
|
||||
env = os.environ.copy()
|
||||
env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(hook_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
|
||||
assert result.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
|
||||
|
||||
|
||||
def test_ripgrep_hook_handles_absolute_path():
|
||||
"""Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
|
||||
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
|
||||
|
||||
rg_path = shutil.which('rg')
|
||||
if not rg_path:
|
||||
pytest.skip("ripgrep (rg) not installed")
|
||||
|
||||
env = os.environ.copy()
|
||||
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
|
||||
env['RIPGREP_BINARY'] = rg_path # Full absolute path
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(hook_path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Hook failed: {result.stderr}"
|
||||
assert result.stdout.strip(), "Hook should produce output"
|
||||
|
||||
installed_binary = json.loads(result.stdout.strip().split('\n')[0])
|
||||
assert installed_binary['abspath'] == rg_path
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_machine_config_overrides_base_config():
|
||||
"""
|
||||
Test that Machine.config overrides take precedence over base config.
|
||||
|
||||
Guards against regression where archivebox version was showing binaries
|
||||
as "not installed" even though they were detected and stored in Machine.config.
|
||||
"""
|
||||
from machine.models import Machine, InstalledBinary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
# Simulate a hook detecting chrome and storing it with a different path than base config
|
||||
detected_chrome_path = '/custom/path/to/chrome'
|
||||
machine.config['CHROME_BINARY'] = detected_chrome_path
|
||||
machine.config['CHROME_VERSION'] = '143.0.7499.170'
|
||||
machine.save()
|
||||
|
||||
# Create InstalledBinary record
|
||||
InstalledBinary.objects.create(
|
||||
machine=machine,
|
||||
name='chrome',
|
||||
abspath=detected_chrome_path,
|
||||
version='143.0.7499.170',
|
||||
binprovider='env',
|
||||
)
|
||||
|
||||
# Verify Machine.config takes precedence
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config()
|
||||
|
||||
# Machine.config should override the base config value
|
||||
assert machine.config.get('CHROME_BINARY') == detected_chrome_path
|
||||
|
||||
# The version command should use Machine.config, not base config
|
||||
# (Base config might have 'chromium' while Machine.config has the full path)
|
||||
bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')
|
||||
assert bin_value == detected_chrome_path, \
|
||||
"Machine.config override should take precedence over base config"
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_search_backend_engine_passed_to_hooks():
|
||||
"""
|
||||
Test that SEARCH_BACKEND_ENGINE is passed to hook environment.
|
||||
|
||||
Guards against regression where hooks couldn't determine which search backend was active.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import build_hook_environment
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config()
|
||||
search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')
|
||||
|
||||
env = build_hook_environment(overrides=None)
|
||||
|
||||
assert 'SEARCH_BACKEND_ENGINE' in env, \
|
||||
"SEARCH_BACKEND_ENGINE must be in hook environment"
|
||||
assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
|
||||
f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_install_creates_installedbinary_records():
|
||||
"""
|
||||
Test that archivebox install creates InstalledBinary records for detected binaries.
|
||||
|
||||
This is an integration test that verifies the full install flow.
|
||||
"""
|
||||
from machine.models import Machine, InstalledBinary
|
||||
from crawls.models import Seed, Crawl
|
||||
from crawls.statemachines import CrawlMachine
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
machine = Machine.current()
|
||||
initial_binary_count = InstalledBinary.objects.filter(machine=machine).count()
|
||||
|
||||
# Create an install crawl (like archivebox install does)
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
seed, _ = Seed.objects.get_or_create(
|
||||
uri='archivebox://test-install',
|
||||
label='Test dependency detection',
|
||||
created_by_id=created_by_id,
|
||||
defaults={'extractor': 'auto'},
|
||||
)
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
seed=seed,
|
||||
max_depth=0,
|
||||
created_by_id=created_by_id,
|
||||
status='queued',
|
||||
)
|
||||
|
||||
# Run the crawl state machine (this triggers hooks)
|
||||
sm = CrawlMachine(crawl)
|
||||
sm.send('tick') # queued -> started (runs hooks)
|
||||
|
||||
# Verify InstalledBinary records were created
|
||||
final_binary_count = InstalledBinary.objects.filter(machine=machine).count()
|
||||
assert final_binary_count > initial_binary_count, \
|
||||
"archivebox install should create InstalledBinary records"
|
||||
|
||||
# Verify at least some common binaries were detected
|
||||
common_binaries = ['git', 'wget', 'node']
|
||||
detected = []
|
||||
for bin_name in common_binaries:
|
||||
if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists():
|
||||
detected.append(bin_name)
|
||||
|
||||
assert detected, f"At least one of {common_binaries} should be detected"
|
||||
|
||||
# Verify detected binaries have valid paths and versions
|
||||
for binary in InstalledBinary.objects.filter(machine=machine):
|
||||
if binary.abspath: # Only check non-empty paths
|
||||
assert '/' in binary.abspath, \
|
||||
f"{binary.name} should have full path, not just name: {binary.abspath}"
|
||||
# Version might be empty for some binaries, that's ok
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_ripgrep_only_detected_when_backend_enabled():
|
||||
"""
|
||||
Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.
|
||||
|
||||
Guards against ripgrep being installed/detected when not needed.
|
||||
"""
|
||||
from machine.models import Machine, InstalledBinary
|
||||
from crawls.models import Seed, Crawl
|
||||
from crawls.statemachines import CrawlMachine
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from django.conf import settings
|
||||
|
||||
if not shutil.which('rg'):
|
||||
pytest.skip("ripgrep (rg) not installed")
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
# Clear any existing ripgrep records
|
||||
InstalledBinary.objects.filter(machine=machine, name='rg').delete()
|
||||
|
||||
# Test 1: With ripgrep backend - should be detected
|
||||
with patch('archivebox.config.configset.get_config') as mock_config:
|
||||
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
|
||||
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
seed = Seed.objects.create(
|
||||
uri='archivebox://test-rg-enabled',
|
||||
label='Test ripgrep detection enabled',
|
||||
created_by_id=created_by_id,
|
||||
extractor='auto',
|
||||
)
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
seed=seed,
|
||||
max_depth=0,
|
||||
created_by_id=created_by_id,
|
||||
status='queued',
|
||||
)
|
||||
|
||||
sm = CrawlMachine(crawl)
|
||||
sm.send('tick')
|
||||
|
||||
# Ripgrep should be detected
|
||||
rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
|
||||
assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"
|
||||
|
||||
# Clear records again
|
||||
InstalledBinary.objects.filter(machine=machine, name='rg').delete()
|
||||
|
||||
# Test 2: With different backend - should NOT be detected
|
||||
with patch('archivebox.config.configset.get_config') as mock_config:
|
||||
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
|
||||
|
||||
seed2 = Seed.objects.create(
|
||||
uri='archivebox://test-rg-disabled',
|
||||
label='Test ripgrep detection disabled',
|
||||
created_by_id=created_by_id,
|
||||
extractor='auto',
|
||||
)
|
||||
|
||||
crawl2 = Crawl.objects.create(
|
||||
seed=seed2,
|
||||
max_depth=0,
|
||||
created_by_id=created_by_id,
|
||||
status='queued',
|
||||
)
|
||||
|
||||
sm2 = CrawlMachine(crawl2)
|
||||
sm2.send('tick')
|
||||
|
||||
# Ripgrep should NOT be detected
|
||||
rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
|
||||
assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -29,7 +29,7 @@ import rich_click as click

# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'

# Text file patterns to index
INDEXABLE_FILES = [

@@ -27,7 +27,7 @@ import rich_click as click

# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'

# Text file patterns to index, in priority order
INDEXABLE_FILES = [

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';

// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

// Extract SEO metadata
async function extractSeo(url) {
    // Create output directory
    if (!fs.existsSync(OUTPUT_DIR)) {
        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;

@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');

const OUTPUT_DIR = 'singlefile';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';

/**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
        .filter(fn => fn.endsWith('.html'))
    );

    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
        return null;
    }

    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build command

@@ -41,7 +41,7 @@ import rich_click as click
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'singlefile'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'


@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
    return default


STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'

def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
    return ''


CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'


def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    if extra_args:
        cmd.extend(extra_args.split())

    # Create output directory
    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE

    cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
        sys.exit(1)

    version = get_version(binary)
    cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
    cmd_str = f'{binary} {url} {OUTPUT_FILE}'

    # Run extraction
    success, output, error = save_singlefile(url, binary)

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

// Extractor metadata
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';

// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

// Extract SSL details
async function extractSsl(url) {
    // Create output directory
    if (!fs.existsSync(OUTPUT_DIR)) {
        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Only extract SSL for HTTPS URLs

@@ -31,8 +31,8 @@ import rich_click as click

# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'
CHROME_SESSION_DIR = 'chrome_session'
OUTPUT_DIR = '.'
CHROME_SESSION_DIR = '../chrome_session'

# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
    if content_length and int(content_length) > max_size:
        return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'

    # Create output directory
    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)

    # Determine filename
    filename = get_filename_from_url(url)

@@ -21,9 +21,9 @@ const http = require('http');

// Extractor metadata
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';

// Parse command line arguments
function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
}

async function extractTitle(url) {
    // Create output directory
    if (!fs.existsSync(OUTPUT_DIR)) {
        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Try Chrome session first

@@ -43,7 +43,7 @@ import rich_click as click
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
OUTPUT_DIR = '.'


def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
    return default


STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'

def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL."""
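The same edit repeats across every extractor above: OUTPUT_DIR becomes '.' and shared paths like chrome_session/staticfile become '../…', because the hook runner is now expected to chdir into each extractor's own output directory before launching its script. A minimal sketch of that assumed behavior (function and argument names are illustrative, not the actual ArchiveBox hook API):

import subprocess
from pathlib import Path

def run_extractor_hook(snapshot_dir: Path, extractor_name: str, script: Path, url: str) -> int:
    # Assumed behavior: each hook gets its own output dir as cwd, so the script
    # writes to '.' and reaches shared dirs like chrome_session via '../'.
    outdir = snapshot_dir / extractor_name
    outdir.mkdir(parents=True, exist_ok=True)
    proc = subprocess.run([str(script), url], cwd=outdir, capture_output=True, text=True)
    return proc.returncode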
File diff suppressed because it is too large
@@ -57,13 +57,24 @@
    box-shadow: 0 0 8px #3fb950;
    animation: pulse 2s infinite;
}
#progress-monitor .status-dot.idle {
    background: #d29922;
    box-shadow: 0 0 4px #d29922;
}
#progress-monitor .status-dot.stopped {
    background: #f85149;
    background: #6e7681;
}
#progress-monitor .status-dot.flash {
    animation: flash 0.3s ease-out;
}
@keyframes pulse {
    0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
    50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
}
@keyframes flash {
    0% { transform: scale(1.5); }
    100% { transform: scale(1); }
}

/* Stats */
#progress-monitor .stats {
@@ -89,6 +100,19 @@
#progress-monitor .stat-value.error { color: #f85149; }
#progress-monitor .stat-value.warning { color: #d29922; }
#progress-monitor .stat-value.info { color: #58a6ff; }
#progress-monitor .stat.clickable {
    cursor: pointer;
    padding: 2px 6px;
    margin: -2px -6px;
    border-radius: 4px;
    transition: background 0.2s;
}
#progress-monitor .stat.clickable:hover {
    background: rgba(255,255,255,0.1);
}
#progress-monitor .stat.clickable:active {
    background: rgba(255,255,255,0.2);
}

/* Toggle Button */
#progress-monitor .toggle-btn {
@@ -259,48 +283,86 @@
    padding: 0 12px 8px;
}

/* Extractor List */
/* Extractor List - Compact Badge Layout */
#progress-monitor .extractor-list {
    padding: 8px 12px;
    background: rgba(0,0,0,0.2);
    border-top: 1px solid #21262d;
    display: flex;
    flex-wrap: wrap;
    gap: 4px;
}
#progress-monitor .extractor-item {
#progress-monitor .extractor-badge {
    position: relative;
    display: inline-flex;
    align-items: center;
    gap: 4px;
    padding: 3px 8px;
    border-radius: 4px;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 10px;
    background: #21262d;
    overflow: hidden;
    white-space: nowrap;
}
#progress-monitor .extractor-badge .progress-fill {
    position: absolute;
    top: 0;
    left: 0;
    bottom: 0;
    z-index: 0;
    transition: width 0.3s ease-out;
}
#progress-monitor .extractor-badge .badge-content {
    position: relative;
    z-index: 1;
    display: flex;
    align-items: center;
    gap: 8px;
    padding: 4px 0;
    gap: 4px;
}
#progress-monitor .extractor-icon {
    font-size: 12px;
    width: 16px;
    text-align: center;
#progress-monitor .extractor-badge.queued {
    color: #8b949e;
}
#progress-monitor .extractor-icon.running {
#progress-monitor .extractor-badge.queued .progress-fill {
    background: rgba(110, 118, 129, 0.2);
    width: 0%;
}
#progress-monitor .extractor-badge.started {
    color: #d29922;
    animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
#progress-monitor .extractor-badge.started .progress-fill {
    background: rgba(210, 153, 34, 0.3);
    width: 50%;
    animation: progress-pulse 1.5s ease-in-out infinite;
}
@keyframes progress-pulse {
    0%, 100% { opacity: 0.5; }
    50% { opacity: 1; }
}
#progress-monitor .extractor-badge.succeeded {
    color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
#progress-monitor .extractor-badge.succeeded .progress-fill {
    background: rgba(63, 185, 80, 0.25);
    width: 100%;
}
#progress-monitor .extractor-badge.failed {
    color: #f85149;
}
#progress-monitor .extractor-icon.pending {
    color: #8b949e;
#progress-monitor .extractor-badge.failed .progress-fill {
    background: rgba(248, 81, 73, 0.25);
    width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
    font-size: 10px;
}
#progress-monitor .extractor-badge.started .badge-icon {
    animation: spin 1s linear infinite;
}
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
    flex: 1;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 11px;
}
#progress-monitor .extractor-progress {
    width: 60px;
}

/* Status Badge */
#progress-monitor .status-badge {
@@ -356,11 +418,11 @@
    <span class="stat-label">Queued</span>
    <span class="stat-value warning" id="total-queued">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-succeeded" title="Click to reset counter">
    <span class="stat-label">Done</span>
    <span class="stat-value success" id="total-succeeded">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-failed" title="Click to reset counter">
    <span class="stat-label">Failed</span>
    <span class="stat-value error" id="total-failed">0</span>
</div>
@@ -390,6 +452,24 @@
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));

// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
let lastSucceeded = 0;
let lastFailed = 0;

// Click handlers for resetting counters
document.getElementById('stat-succeeded').addEventListener('click', function() {
    succeededBaseline = lastSucceeded;
    localStorage.setItem('progress-succeeded-baseline', succeededBaseline);
    document.getElementById('total-succeeded').textContent = '0';
});
document.getElementById('stat-failed').addEventListener('click', function() {
    failedBaseline = lastFailed;
    localStorage.setItem('progress-failed-baseline', failedBaseline);
    document.getElementById('total-failed').textContent = '0';
});

function formatUrl(url) {
    try {
        const u = new URL(url);
@@ -400,24 +480,18 @@
}

function renderExtractor(extractor) {
    const iconClass = extractor.status === 'started' ? 'running' :
                      extractor.status === 'succeeded' ? 'success' :
                      extractor.status === 'failed' ? 'failed' : 'pending';
    const icon = extractor.status === 'started' ? '↻' :
                 extractor.status === 'succeeded' ? '✓' :
                 extractor.status === 'failed' ? '✗' : '○';

    return `
        <div class="extractor-item">
            <span class="extractor-icon ${iconClass}">${icon}</span>
            <span class="extractor-name">${extractor.extractor}</span>
            <div class="extractor-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
                         style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
                </div>
            </div>
        </div>
        <span class="extractor-badge ${extractor.status}">
            <span class="progress-fill"></span>
            <span class="badge-content">
                <span class="badge-icon">${icon}</span>
                <span>${extractor.extractor}</span>
            </span>
        </span>
    `;
}

@@ -427,10 +501,14 @@
    const statusIcon = snapshot.status === 'started' ? '↻' : '📄';

    let extractorHtml = '';
    if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
    if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
        // Sort extractors alphabetically by name to prevent reordering on updates
        const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
            a.extractor.localeCompare(b.extractor)
        );
        extractorHtml = `
            <div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
                ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
                ${sortedExtractors.map(e => renderExtractor(e)).join('')}
            </div>
        `;
    }
@@ -438,7 +516,7 @@
    return `
        <div class="snapshot-item" data-snapshot-key="${snapshotKey}">
            <div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '▶' : ''}</span>
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.all_extractors?.length ? '▶' : ''}</span>
                <span class="snapshot-icon">${statusIcon}</span>
                <div class="snapshot-info">
                    <div class="snapshot-url">${formatUrl(snapshot.url)}</div>
@@ -469,6 +547,40 @@
        snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
    }

    // Show warning if crawl is stuck (queued but can't start)
    let warningHtml = '';
    if (crawl.status === 'queued' && !crawl.can_start) {
        warningHtml = `
            <div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
                ⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
            </div>
        `;
    } else if (crawl.status === 'queued' && crawl.retry_at_future) {
        // Queued but retry_at is in future (was claimed by worker, will retry)
        warningHtml = `
            <div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
                🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
            </div>
        `;
    } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
        // Queued and waiting to be picked up by worker
        warningHtml = `
            <div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
                ⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
            </div>
        `;
    }

    // Show snapshot info or URL count if no snapshots yet
    let metaText = `depth: ${crawl.max_depth}`;
    if (crawl.total_snapshots > 0) {
        metaText += ` | ${crawl.total_snapshots} snapshots`;
    } else if (crawl.urls_count > 0) {
        metaText += ` | ${crawl.urls_count} URLs`;
    } else if (crawl.seed_uri) {
        metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
    }

    return `
        <div class="crawl-item" data-crawl-id="${crawl.id}">
            <div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
@@ -476,10 +588,11 @@
                <span class="crawl-icon">${statusIcon}</span>
                <div class="crawl-info">
                    <div class="crawl-label">${crawl.label}</div>
                    <div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
                    <div class="crawl-meta">${metaText}</div>
                </div>
                <div class="crawl-stats">
                    <span style="color:#3fb950">${crawl.completed_snapshots} done</span>
                    <span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
                    <span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
                </div>
                <span class="status-badge ${crawl.status}">${crawl.status}</span>
@@ -490,6 +603,7 @@
                         style="width: ${crawl.progress}%"></div>
                </div>
            </div>
            ${warningHtml}
            <div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
                <div class="snapshot-list">
                    ${snapshotsHtml}
@@ -542,25 +656,48 @@
        data.snapshots_pending > 0 || data.snapshots_started > 0 ||
        data.archiveresults_pending > 0 || data.archiveresults_started > 0;

    // Update orchestrator status
    // Update orchestrator status - show "Running" only when there's actual activity
    // Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently
    const dot = document.getElementById('orchestrator-dot');
    const text = document.getElementById('orchestrator-text');
    if (data.orchestrator_running) {
        dot.classList.remove('stopped');
    const hasWorkers = data.total_workers > 0;

    if (hasWorkers || hasActivity) {
        dot.classList.remove('stopped', 'idle');
        dot.classList.add('running');
        text.textContent = 'Running';
    } else {
        dot.classList.remove('running');
        dot.classList.add('stopped');
        text.textContent = 'Stopped';
        // No activity - show as idle (whether orchestrator process exists or not)
        dot.classList.remove('stopped', 'running');
        dot.classList.add('idle');
        text.textContent = 'Idle';
    }

    // Pulse the dot to show we got fresh data
    dot.classList.add('flash');
    setTimeout(() => dot.classList.remove('flash'), 300);

    // Update stats
    document.getElementById('worker-count').textContent = data.total_workers;
    document.getElementById('total-queued').textContent =
        data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
    document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
    document.getElementById('total-failed').textContent = data.archiveresults_failed;

    // Store raw values and display relative to baseline
    lastSucceeded = data.archiveresults_succeeded;
    lastFailed = data.archiveresults_failed;

    // If baseline is higher than current (e.g. after DB reset), reset baseline
    if (succeededBaseline > lastSucceeded) {
        succeededBaseline = 0;
        localStorage.setItem('progress-succeeded-baseline', '0');
    }
    if (failedBaseline > lastFailed) {
        failedBaseline = 0;
        localStorage.setItem('progress-failed-baseline', '0');
    }

    document.getElementById('total-succeeded').textContent = lastSucceeded - succeededBaseline;
    document.getElementById('total-failed').textContent = lastFailed - failedBaseline;

    // Render crawl tree
    if (data.active_crawls.length > 0) {
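The template above only reads the progress payload; it expects top-level worker/queue counters plus an active_crawls tree with nested active_snapshots and all_extractors. A rough Python sketch of that expected shape, with field names taken from the JS above and all values invented for illustration:

example_progress_payload = {
    'orchestrator_running': True,
    'total_workers': 2,
    'crawls_pending': 0,
    'snapshots_pending': 1, 'snapshots_started': 1,
    'archiveresults_pending': 5, 'archiveresults_started': 2,
    'archiveresults_succeeded': 40, 'archiveresults_failed': 1,
    'active_crawls': [{
        'id': 'c1', 'label': 'example.com', 'status': 'started', 'progress': 60,
        'max_depth': 0, 'can_start': True, 'retry_at_future': False, 'seconds_until_retry': 0,
        'seed_uri': 'https://example.com', 'urls_count': 1,
        'total_snapshots': 1, 'completed_snapshots': 0, 'started_snapshots': 1, 'pending_snapshots': 0,
        'active_snapshots': [{
            'url': 'https://example.com', 'status': 'started',
            'all_extractors': [{'extractor': 'singlefile', 'status': 'started', 'progress': 50}],
        }],
    }],
}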
@@ -7,9 +7,14 @@ class Command(BaseCommand):
    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
        parser.add_argument(
            '--exit-on-idle',
            action='store_true',
            default=False,
            help="Exit when all work is complete (default: run forever)"
        )

    def handle(self, *args, **kwargs):
        daemon = kwargs.get('daemon', False)
        orchestrator = Orchestrator(exit_on_idle=not daemon)
        exit_on_idle = kwargs.get('exit_on_idle', False)
        orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
        orchestrator.runloop()
@@ -12,16 +12,17 @@ Architecture:
    └── Each worker spawns task subprocesses via CLI

Usage:
    # Embedded in other commands (exits when done)
    # Default: runs forever (for use as subprocess of server)
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.runloop()

    # Exit when done (for embedded use in other commands)
    orchestrator = Orchestrator(exit_on_idle=True)
    orchestrator.runloop()

    # Daemon mode (runs forever)
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.start()  # fork and return


    # Or run via CLI
    archivebox orchestrator [--daemon]
    archivebox manage orchestrator  # runs forever
    archivebox manage orchestrator --exit-on-idle  # exits when done
"""

__package__ = 'archivebox.workers'
@@ -45,6 +46,14 @@ from .pid_utils import (
)


def _run_orchestrator_process(exit_on_idle: bool) -> None:
    """Top-level function for multiprocessing (must be picklable)."""
    from archivebox.config.django import setup_django
    setup_django()
    orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
    orchestrator.runloop()


class Orchestrator:
    """
    Manages worker processes by polling queues and spawning workers as needed.
@@ -277,12 +286,12 @@
        Fork orchestrator as a background process.
        Returns the PID of the new process.
        """
        def run_orchestrator():
            from archivebox.config.django import setup_django
            setup_django()
            self.runloop()

        proc = Process(target=run_orchestrator, name='orchestrator')
        # Use module-level function to avoid pickle errors with local functions
        proc = Process(
            target=_run_orchestrator_process,
            args=(self.exit_on_idle,),
            name='orchestrator'
        )
        proc.start()

        assert proc.pid is not None
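The closure-to-module-level change above matters because multiprocessing pickles the Process target under the spawn start method (the default on macOS), and locally defined functions cannot be pickled. A small standalone illustration of the difference (not ArchiveBox code):

import multiprocessing as mp

def module_level_target(msg: str) -> None:
    print(f'child says: {msg}')   # picklable: importable by its qualified name

def demo() -> None:
    def nested_target(msg: str) -> None:
        print(f'child says: {msg}')   # local object: cannot be pickled

    ctx = mp.get_context('spawn')
    ctx.Process(target=module_level_target, args=('ok',)).start()   # works
    try:
        ctx.Process(target=nested_target, args=('boom',)).start()   # fails to pickle
    except Exception as exc:
        print(f'nested target could not be spawned: {exc!r}')

if __name__ == '__main__':
    demo()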
@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"

ORCHESTRATOR_WORKER = {
    "name": "worker_orchestrator",
    "command": "archivebox manage orchestrator",
    "command": "archivebox manage orchestrator",  # runs forever by default
    "autostart": "true",
    "autorestart": "true",
    "stdout_logfile": "logs/worker_orchestrator.log",
@@ -332,14 +332,14 @@

def tail_worker_logs(log_path: str):
    get_or_create_supervisord_process(daemonize=False)


    from rich.live import Live
    from rich.table import Table


    table = Table()
    table.add_column("TS")
    table.add_column("URL")


    try:
        with Live(table, refresh_per_second=1) as live:  # update 4 times a second to feel fluid
            with open(log_path, 'r') as f:
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
    except SystemExit:
        pass


def tail_multiple_worker_logs(log_files: list[str], follow=True):
    """Tail multiple log files simultaneously, interleaving their output."""
    import select
    from pathlib import Path

    # Convert relative paths to absolute paths
    log_paths = []
    for log_file in log_files:
        log_path = Path(log_file)
        if not log_path.is_absolute():
            log_path = CONSTANTS.DATA_DIR / log_path

        # Create log file if it doesn't exist
        if not log_path.exists():
            log_path.parent.mkdir(parents=True, exist_ok=True)
            log_path.touch()

        log_paths.append(log_path)

    # Open all log files
    file_handles = []
    for log_path in log_paths:
        try:
            f = open(log_path, 'r')
            # Seek to end of file if following
            if follow:
                f.seek(0, 2)  # Seek to end
            file_handles.append((log_path.name, f))
        except Exception as e:
            print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")

    if not file_handles:
        print("[red]No log files could be opened[/red]")
        return

    # Print which logs we're tailing
    log_names = [name for name, _ in file_handles]
    print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
    print()

    try:
        while follow:
            # Read available lines from all files
            for log_name, f in file_handles:
                line = f.readline()
                if line:
                    # Colorize based on log source
                    if 'orchestrator' in log_name.lower():
                        color = 'cyan'
                    elif 'daphne' in log_name.lower():
                        color = 'green'
                    else:
                        color = 'white'

                    # Strip ANSI codes if present (supervisord does this but just in case)
                    import re
                    line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())

                    if line_clean:
                        print(f'[{color}][{log_name}][/{color}] {line_clean}')

            # Small sleep to avoid busy-waiting
            time.sleep(0.1)

    except (KeyboardInterrupt, BrokenPipeError, IOError):
        print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
    except SystemExit:
        pass
    finally:
        # Close all file handles
        for _, f in file_handles:
            try:
                f.close()
            except Exception:
                pass

def watch_worker(supervisor, daemon_name, interval=5):
    """loop continuously and monitor worker's health"""
    while True:
@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.

These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.

NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
"""

__package__ = 'archivebox.workers'
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
from django.utils import timezone


def ensure_orchestrator_running():
    """Ensure the orchestrator is running to process queued items."""
    from .orchestrator import Orchestrator

    if not Orchestrator.is_running():
        # Start orchestrator in background
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.start()


def bg_add(add_kwargs: dict) -> int:
    """
    Add URLs and queue them for archiving.
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:

    result = add(**add_kwargs)

    # Ensure orchestrator is running to process the new snapshots
    ensure_orchestrator_running()

    return len(result) if result else 0


@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
        )
        queued_count += 1

    # Ensure orchestrator is running to process the queued snapshots
    if queued_count > 0:
        ensure_orchestrator_running()

    return queued_count


@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
            status=Snapshot.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )

        # Ensure orchestrator is running to process the queued snapshot
        ensure_orchestrator_running()
        return 1

    return 0
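With ensure_orchestrator_running() gone, these helpers only mark rows QUEUED; an orchestrator started elsewhere is what actually drains the queue. A hedged sketch of the calling pattern this implies — bg_archive_snapshots is the helper from this file, while the module path and wrapper function are illustrative:

from archivebox.workers.tasks import bg_archive_snapshots  # module path assumed for illustration

def requeue(snapshots) -> int:
    queued = bg_archive_snapshots(snapshots)   # only flips rows to QUEUED / retry_at=now
    # Nothing runs here: either `archivebox server` (supervisord orchestrator worker) is
    # already running, or the caller runs one inline, e.g.:
    #   archivebox manage orchestrator --exit-on-idle
    return queued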
@@ -67,8 +67,8 @@ class Worker:
    # Configuration (can be overridden by subclasses)
    MAX_TICK_TIME: ClassVar[int] = 60
    MAX_CONCURRENT_TASKS: ClassVar[int] = 1
    POLL_INTERVAL: ClassVar[float] = 0.5
    IDLE_TIMEOUT: ClassVar[int] = 3  # Exit after N idle iterations (set to 0 to never exit)
    POLL_INTERVAL: ClassVar[float] = 1.0
    IDLE_TIMEOUT: ClassVar[int] = 10  # Exit after N idle iterations (10 sec at 1.0 poll interval)

    def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
        self.worker_id = worker_id
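The POLL_INTERVAL/IDLE_TIMEOUT change above stretches the idle window from roughly 1.5s to about 10s of consecutive empty polls before a worker exits. A simplified sketch of how those two knobs are presumably consumed by the run loop (the real Worker.runloop is not shown in this diff, and the claim/process methods are hypothetical):

import time

def runloop_sketch(worker) -> None:
    idle_iterations = 0
    while True:
        task = worker.claim_next_task()   # hypothetical: returns None when the queue is empty
        if task is not None:
            idle_iterations = 0
            worker.process(task)          # hypothetical task handler
        else:
            idle_iterations += 1
            if worker.IDLE_TIMEOUT and idle_iterations >= worker.IDLE_TIMEOUT:
                break                     # ~10s idle at POLL_INTERVAL=1.0 before exiting
        time.sleep(worker.POLL_INTERVAL)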