diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py
index 8037f42d..dafa428f 100644
--- a/archivebox/base_models/models.py
+++ b/archivebox/base_models/models.py
@@ -122,12 +122,10 @@ class ModelWithOutputDir(ModelWithSerializers):
class Meta:
abstract = True
- def save(self, *args, write_indexes=False, **kwargs):
+ def save(self, *args, **kwargs):
super().save(*args, **kwargs)
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
self.save_json_index()
- if write_indexes:
- self.write_indexes()
@property
def output_dir_parent(self) -> str:
@@ -145,17 +143,5 @@ class ModelWithOutputDir(ModelWithSerializers):
def OUTPUT_DIR(self) -> Path:
return DATA_DIR / self.output_dir_str
- def write_indexes(self):
- self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
- self.save_merkle_index()
- self.save_html_index()
-
- def save_merkle_index(self):
- with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
- json.dump(get_dir_info(self.OUTPUT_DIR, max_depth=6), f)
-
- def save_html_index(self):
- (self.OUTPUT_DIR / 'index.html').write_text(str(self))
-
def save_json_index(self):
(self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index 146e047c..fb0b1148 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -72,30 +72,42 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
get_worker,
start_server_workers,
tail_multiple_worker_logs,
+ is_port_in_use,
)
+ from workers.orchestrator import Orchestrator
+ import sys
+
+ # Check if port is already in use
+ if is_port_in_use(host, int(port)):
+ print(f'[red][X] Error: Port {port} is already in use[/red]')
+ print(f' Another process (possibly daphne) is already listening on {host}:{port}')
+ print(f' Stop the conflicting process or choose a different port')
+ sys.exit(1)
+
+ # Check if orchestrator is already running for this data directory
+ if Orchestrator.is_running():
+ print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
+ print(f' Stop the existing orchestrator before starting a new server')
+ print(f' To stop: pkill -f "archivebox manage orchestrator"')
+ sys.exit(1)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
- # If daphne is already running, just tail logs
+ # If daphne is already running, error out
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
- print('[yellow][!] ArchiveBox server is already running[/yellow]')
+ print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
- print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
- print()
-
- # Tail logs for both workers
- tail_multiple_worker_logs(
- log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
- follow=True,
- )
- return
+ print('[yellow]To stop the existing server, run:[/yellow]')
+ print(' pkill -f "archivebox server"')
+ print(' pkill -f supervisord')
+ sys.exit(1)
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
diff --git a/archivebox/config/views.py b/archivebox/config/views.py
index 0f1c33b6..66b8de4d 100644
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -91,31 +91,43 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
"""Discover plugins from filesystem directories."""
+ import json
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
-
+
plugins = {}
-
+
for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
if not base_dir.exists():
continue
-
+
for plugin_dir in base_dir.iterdir():
if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
plugin_id = f'{source}.{plugin_dir.name}'
-
+
# Find hook scripts
hooks = []
for ext in ('sh', 'py', 'js'):
hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
-
+
+ # Load config.json if it exists
+ config_file = plugin_dir / 'config.json'
+ config_data = None
+ if config_file.exists():
+ try:
+ with open(config_file, 'r') as f:
+ config_data = json.load(f)
+ except (json.JSONDecodeError, IOError):
+ config_data = None
+
plugins[plugin_id] = {
'id': plugin_id,
'name': plugin_dir.name,
'path': str(plugin_dir),
'source': source,
'hooks': [str(h.name) for h in hooks],
+ 'config': config_data,
}
-
+
return plugins
@@ -242,6 +254,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Source": [],
"Path": [],
"Hooks": [],
+ "Config": [],
}
plugins = get_filesystem_plugins()
@@ -252,12 +265,21 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Path'].append(format_html('{}', plugin['path']))
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
+ # Show config status
+ if plugin.get('config'):
+ config_properties = plugin['config'].get('properties', {})
+ config_count = len(config_properties)
+ rows['Config'].append(f'✅ {config_count} properties' if config_count > 0 else '✅ present')
+ else:
+ rows['Config'].append('❌ none')
+
if not plugins:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(mark_safe('archivebox/plugins/ or data/plugins/'))
rows['Hooks'].append('-')
+ rows['Config'].append('-')
return TableContext(
title="Installed plugins",
@@ -266,11 +288,12 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+ import json
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
plugins = get_filesystem_plugins()
-
+
plugin = plugins.get(key)
if not plugin:
return ItemContext(
@@ -279,6 +302,33 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
data=[],
)
+ # Base fields that all plugins have
+ fields = {
+ "id": plugin['id'],
+ "name": plugin['name'],
+ "source": plugin['source'],
+ "path": plugin['path'],
+ "hooks": plugin['hooks'],
+ }
+
+ # Add config.json data if available
+ if plugin.get('config'):
+ config_json = json.dumps(plugin['config'], indent=2)
+ fields["config.json"] = mark_safe(f'
{config_json}
')
+
+ # Also extract and display individual config properties for easier viewing
+ if 'properties' in plugin['config']:
+ config_properties = plugin['config']['properties']
+ properties_summary = []
+ for prop_name, prop_info in config_properties.items():
+ prop_type = prop_info.get('type', 'unknown')
+ prop_default = prop_info.get('default', 'N/A')
+ prop_desc = prop_info.get('description', '')
+                    properties_summary.append(f"• {prop_name} ({prop_type}, default: {prop_default}): {prop_desc}")
+
+ if properties_summary:
+ fields["Config Properties"] = mark_safe('
'.join(properties_summary))
+
return ItemContext(
slug=key,
title=plugin['name'],
@@ -286,13 +336,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
{
"name": plugin['name'],
"description": plugin['path'],
- "fields": {
- "id": plugin['id'],
- "name": plugin['name'],
- "source": plugin['source'],
- "path": plugin['path'],
- "hooks": plugin['hooks'],
- },
+ "fields": fields,
"help_texts": {},
},
],
diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index 59864571..082b11f8 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -22,7 +22,7 @@ from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, extractor, output, and actions."""
- results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
+ results = list(archiveresults_qs.order_by('extractor').select_related('snapshot')[:limit])
if not results:
        return mark_safe('No Archive Results yet...')
@@ -239,7 +239,7 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
- readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
+ readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -249,7 +249,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Extractor', {
- 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
+ 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at', 'iface'),
'classes': ('card',),
}),
('Timing', {
diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py
index 67e074ac..6b3fe678 100644
--- a/archivebox/core/admin_site.py
+++ b/archivebox/core/admin_site.py
@@ -12,7 +12,7 @@ class ArchiveBoxAdmin(admin.AdminSite):
archivebox_admin = ArchiveBoxAdmin()
-archivebox_admin.disable_action('delete_selected')
+# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index bd73c363..4f0217a3 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -23,7 +23,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
-from core.models import Tag
+from core.models import Tag, Snapshot
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
@@ -262,6 +262,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
for tag in obj.tags.all()
if str(tag.name).strip()
)
+ # Show title if available, otherwise show URL
+ display_text = obj.title or obj.url
+ css_class = 'fetched' if obj.title else 'pending'
+
return format_html(
''
'
'
@@ -272,8 +276,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
- 'fetched' if obj.title else 'pending',
- urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
+ css_class,
+ urldecode(htmldecode(display_text))[:128]
) + mark_safe(f' {tags}')
@admin.display(
@@ -402,12 +406,21 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
- from archivebox.cli.archivebox_remove import remove
- remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
-
+ """Delete snapshots in a single transaction to avoid SQLite concurrency issues."""
+ from django.db import transaction
+
+ total = queryset.count()
+
+ # Get list of IDs to delete first (outside transaction)
+ ids_to_delete = list(queryset.values_list('pk', flat=True))
+
+ # Delete everything in a single atomic transaction
+ with transaction.atomic():
+ deleted_count, _ = Snapshot.objects.filter(pk__in=ids_to_delete).delete()
+
messages.success(
request,
- mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
+ mark_safe(f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
)
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 57369460..d4a32639 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -295,7 +295,7 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
- created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
+ created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -362,9 +362,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
},
)
+ @property
def output_dir_parent(self) -> str:
return 'archive'
+ @property
def output_dir_name(self) -> str:
return str(self.timestamp)
@@ -808,7 +810,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# UUID field is added separately by migration for new records
id = models.AutoField(primary_key=True, editable=False)
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True, unique=True)
- created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
+ created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -844,7 +846,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save(self, *args, **kwargs):
is_new = self._state.adding
- super().save(*args, **kwargs)
+ # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
+ # Call the Django Model.save() directly instead
+ models.Model.save(self, *args, **kwargs)
+
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
@@ -916,9 +921,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_dir_parent(self) -> str:
return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
- def write_indexes(self):
- super().write_indexes()
-
def save_search_index(self):
pass
@@ -966,6 +968,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
)
end_ts = timezone.now()
+ # Clean up empty output directory if no files were created
+ output_files = result.get('output_files', [])
+ if not output_files and extractor_dir.exists():
+ try:
+ # Only remove if directory is completely empty
+ if not any(extractor_dir.iterdir()):
+ extractor_dir.rmdir()
+ except (OSError, RuntimeError):
+ pass # Directory not empty or can't be removed, that's fine
+
# Determine status from return code and JSON output
output_json = result.get('output_json') or {}
json_status = output_json.get('status')
@@ -990,15 +1002,46 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.start_ts = start_ts
self.end_ts = end_ts
self.retry_at = None
+ self.pwd = str(extractor_dir)
+
+ # Save cmd and cmd_version from extractor output
+ if output_json.get('cmd_version'):
+ self.cmd_version = output_json['cmd_version'][:128] # Max length from model
+ if output_json.get('cmd'):
+ self.cmd = output_json['cmd']
+
self.save()
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
self._queue_urls_for_crawl(extractor_dir)
+ # Update snapshot title if this is the title extractor
+ # Check both old numeric name and new plugin name for compatibility
+ extractor_name = get_extractor_name(self.extractor)
+ if self.status == self.StatusChoices.SUCCEEDED and extractor_name == 'title':
+ self._update_snapshot_title(extractor_dir)
+
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
+ def _update_snapshot_title(self, extractor_dir: Path):
+ """
+ Update snapshot title from title extractor output.
+
+ The title extractor writes title.txt with the extracted page title.
+ This updates the Snapshot.title field if the file exists and has content.
+ """
+ title_file = extractor_dir / 'title.txt'
+ if title_file.exists():
+ try:
+ title = title_file.read_text(encoding='utf-8').strip()
+ if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
+ self.snapshot.title = title[:512] # Max length from model
+ self.snapshot.save(update_fields=['title', 'modified_at'])
+ except Exception:
+ pass # Failed to read title, that's okay
+
def _queue_urls_for_crawl(self, extractor_dir: Path):
"""
Read urls.jsonl and queue discovered URLs for crawling.
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index eccefbbd..610f6fe0 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
# unlock the snapshot after we're done + set status = started
self.snapshot.update_for_workers(
- retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
+ retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
@@ -209,12 +209,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
+ from machine.models import NetworkInterface
+
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
+ iface=NetworkInterface.current(),
)
# Run the extractor - this updates status, output, timestamps, etc.
@@ -234,7 +237,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=None,
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
- self.archiveresult.save(write_indexes=True)
+ self.archiveresult.save()
@succeeded.enter
def enter_succeeded(self):
@@ -245,7 +248,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
- self.archiveresult.save(write_indexes=True)
+ self.archiveresult.save()
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 4c6932df..b8139506 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -560,16 +560,28 @@ def live_progress_view(request):
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
# Build hierarchical active crawls with nested snapshots and archive results
- active_crawls = []
- for crawl in Crawl.objects.filter(
+ from django.db.models import Prefetch
+
+ active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
- ).order_by('-modified_at')[:10]:
- # Get snapshots for this crawl
- crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
- total_snapshots = crawl_snapshots.count()
- completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
- started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
- pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
+ ).prefetch_related(
+ 'snapshot_set',
+ 'snapshot_set__archiveresult_set',
+ ).distinct().order_by('-modified_at')[:10]
+
+ active_crawls = []
+ for crawl in active_crawls_qs:
+        # All snapshots for this crawl were prefetched above - filter and count in Python
+        all_crawl_snapshots = list(crawl.snapshot_set.all())
+        crawl_snapshots = [
+            s for s in all_crawl_snapshots
+            if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
+        ][:5]  # Show at most 5 active snapshots per crawl
+
+        # Count snapshots by status (in memory, no extra DB queries)
+        total_snapshots = len(all_crawl_snapshots)
+        completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
+        started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
+        pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -579,39 +591,39 @@ def live_progress_view(request):
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
- # Get active snapshots for this crawl
+ # Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
- for snapshot in crawl_snapshots.filter(
- status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
- ).order_by('-modified_at')[:5]:
- # Get archive results for this snapshot
- snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
- total_extractors = snapshot_results.count()
- completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
- failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
- pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
+ for snapshot in crawl_snapshots:
+ # Get archive results for this snapshot (already prefetched)
+ snapshot_results = snapshot.archiveresult_set.all()
+
+ # Count in memory instead of DB queries
+ total_extractors = len(snapshot_results)
+ completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
+ failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
+ pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
- # Get all extractors for this snapshot
+ # Get all extractors for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
+ def extractor_sort_key(ar):
+ status_order = {
+ ArchiveResult.StatusChoices.STARTED: 0,
+ ArchiveResult.StatusChoices.QUEUED: 1,
+ ArchiveResult.StatusChoices.SUCCEEDED: 2,
+ ArchiveResult.StatusChoices.FAILED: 3,
+ }
+ return (status_order.get(ar.status, 4), ar.extractor)
+
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
}
- for ar in snapshot_results.annotate(
- status_order=Case(
- When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
- When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
- When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
- When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
- default=Value(4),
- output_field=IntegerField(),
- )
- ).order_by('status_order', 'extractor')
+ for ar in sorted(snapshot_results, key=extractor_sort_key)
]
active_snapshots_for_crawl.append({
@@ -726,15 +738,39 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
+ from typing import get_type_hints, ClassVar
CONFIGS = get_all_configs()
-
+
for config in CONFIGS.values():
if hasattr(config, key):
- type_hints = get_type_hints(config)
+ # Try to get from pydantic model_fields first (more reliable)
+ if hasattr(config, 'model_fields') and key in config.model_fields:
+ field = config.model_fields[key]
+ if hasattr(field, 'annotation'):
+ try:
+ return str(field.annotation.__name__)
+ except AttributeError:
+ return str(field.annotation)
+
+ # Fallback to get_type_hints with proper namespace
try:
- return str(type_hints[key].__name__)
- except AttributeError:
- return str(type_hints[key])
+ import typing
+ namespace = {
+ 'ClassVar': ClassVar,
+ 'Optional': typing.Optional,
+ 'Union': typing.Union,
+ 'List': typing.List,
+ 'Dict': typing.Dict,
+ 'Path': Path,
+ }
+ type_hints = get_type_hints(config, globalns=namespace, localns=namespace)
+ try:
+ return str(type_hints[key].__name__)
+ except AttributeError:
+ return str(type_hints[key])
+ except Exception:
+ # If all else fails, return str
+ pass
return 'str'
def key_is_safe(key: str) -> bool:
@@ -743,17 +779,55 @@ def key_is_safe(key: str) -> bool:
return False
return True
+def find_config_source(key: str, merged_config: dict) -> str:
+ """Determine where a config value comes from."""
+ import os
+ from machine.models import Machine
+
+ # Check if it's from machine config
+ try:
+ machine = Machine.current()
+ if machine.config and key in machine.config:
+ return 'Machine'
+ except Exception:
+ pass
+
+ # Check if it's from environment variable
+ if key in os.environ:
+ return 'Environment'
+
+ # Check if it's from config file
+ from archivebox.config.configset import BaseConfigSet
+ file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
+ if key in file_config:
+ return 'Config File'
+
+ # Otherwise it's using the default
+ return 'Default'
+
+
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
-
+
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+ # Get merged config that includes Machine.config overrides
+ try:
+ from machine.models import Machine
+ machine = Machine.current()
+ merged_config = get_config()
+ except Exception as e:
+ # Fallback if Machine model not available
+ merged_config = get_config()
+ machine = None
+
rows = {
"Section": [],
"Key": [],
"Type": [],
"Value": [],
+ "Source": [],
"Default": [],
# "Documentation": [],
# "Aliases": [],
@@ -764,7 +838,21 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(format_html('{}', find_config_type(key)))
- rows['Value'].append(mark_safe(f'{getattr(section, key)}') if key_is_safe(key) else '******** (redacted)')
+
+ # Use merged config value (includes machine overrides)
+ actual_value = merged_config.get(key, getattr(section, key, None))
+ rows['Value'].append(mark_safe(f'{actual_value}') if key_is_safe(key) else '******** (redacted)')
+
+ # Show where the value comes from
+ source = find_config_source(key, merged_config)
+ source_colors = {
+ 'Machine': 'purple',
+ 'Environment': 'blue',
+ 'Config File': 'green',
+ 'Default': 'gray'
+ }
+        rows['Source'].append(format_html('<span style="color: {}">{}</span>', source_colors.get(source, 'gray'), source))
+
rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
# rows['Documentation'].append(mark_safe(f'Wiki: {key}'))
# rows['Aliases'].append(', '.join(find_config_aliases(key)))
@@ -775,6 +863,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(format_html('{}', getattr(type(CONSTANTS_CONFIG[key]), '__name__', str(CONSTANTS_CONFIG[key]))))
rows['Value'].append(format_html('{}', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
+ rows['Source'].append(mark_safe('Constant'))
rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}'))
# rows['Documentation'].append(mark_safe(f'Wiki: {key}'))
# rows['Aliases'].append('')
@@ -787,11 +876,58 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
+ import os
+ from machine.models import Machine
+ from archivebox.config.configset import BaseConfigSet
+
CONFIGS = get_all_configs()
FLAT_CONFIG = get_flat_config()
-
+
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
+ # Get merged config
+ merged_config = get_config()
+
+ # Determine all sources for this config value
+ sources_info = []
+
+ # Default value
+ default_val = find_config_default(key)
+ if default_val:
+ sources_info.append(('Default', default_val, 'gray'))
+
+ # Config file value
+ if CONSTANTS.CONFIG_FILE.exists():
+ file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
+ if key in file_config:
+ sources_info.append(('Config File', file_config[key], 'green'))
+
+ # Environment variable
+ if key in os.environ:
+ sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
+
+ # Machine config
+ machine = None
+ machine_admin_url = None
+ try:
+ machine = Machine.current()
+ machine_admin_url = f'/admin/machine/machine/{machine.id}/change/'
+ if machine.config and key in machine.config:
+ sources_info.append(('Machine', machine.config[key] if key_is_safe(key) else '********', 'purple'))
+ except Exception:
+ pass
+
+ # Final computed value
+ final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
+ if not key_is_safe(key):
+ final_value = '********'
+
+ # Build sources display
+    sources_html = '<br>'.join([
+ f'{source}: {value}'
+ for source, value, color in sources_info
+ ])
+
# aliases = USER_CONFIG.get(key, {}).get("aliases", [])
aliases = []
@@ -813,7 +949,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
"fields": {
'Key': key,
'Type': find_config_type(key),
- 'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********',
+ 'Value': final_value,
+ 'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
@@ -830,10 +967,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'Value': mark_safe(f'''
                    {'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)' if not key_is_safe(key) else ''}
- Default:
-
- {find_config_default(key) or '↗️ See in ArchiveBox source code...'}
-
+ Configuration Sources (in priority order):
+ {sources_html}
To change this value, edit data/ArchiveBox.conf or run:
@@ -845,6 +980,20 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
}"
'''),
+ 'Source': mark_safe(f'''
+ The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source.
+
+ Priority order (highest to lowest):
+
+ - Machine - Machine-specific overrides (e.g., resolved binary paths)
+                      {f'<br>→ Edit {key} in Machine.config for this server' if machine_admin_url else ''}
+
+ - Environment - Environment variables
+ - Config File - data/ArchiveBox.conf
+ - Default - Default value from code
+
+                    {f'<br>💡 Tip: To override {key} on this machine, edit the Machine.config field and add:<br>{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
+ '''),
},
},
],
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index fa41f851..f5e1c9ae 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -3,6 +3,7 @@ __package__ = 'archivebox.crawls'
import json
from pathlib import Path
+from django import forms
from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path
@@ -136,16 +137,32 @@ def render_snapshots_list(snapshots_qs, limit=20):
''')
+class CrawlAdminForm(forms.ModelForm):
+ """Custom form for Crawl admin to render urls field as textarea."""
+
+ class Meta:
+ model = Crawl
+ fields = '__all__'
+ widgets = {
+ 'urls': forms.Textarea(attrs={
+ 'rows': 8,
+ 'style': 'width: 100%; font-family: monospace; font-size: 13px;',
+ 'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
+ }),
+ }
+
+
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
+ form = CrawlAdminForm
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
- readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')
+ readonly_fields = ('created_at', 'modified_at', 'snapshots')
fieldsets = (
('URLs', {
- 'fields': ('urls_editor',),
+ 'fields': ('urls',),
'classes': ('card', 'wide'),
}),
('Info', {
@@ -177,9 +194,32 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
- actions = ["delete_selected"]
+ actions = ["delete_selected_batched"]
change_actions = ['recrawl']
+ def get_queryset(self, request):
+ """Optimize queries with select_related and annotations."""
+ qs = super().get_queryset(request)
+ return qs.select_related('schedule', 'created_by').annotate(
+ num_snapshots_cached=Count('snapshot_set')
+ )
+
+ @admin.action(description='Delete selected crawls')
+ def delete_selected_batched(self, request, queryset):
+ """Delete crawls in a single transaction to avoid SQLite concurrency issues."""
+ from django.db import transaction
+
+ total = queryset.count()
+
+ # Get list of IDs to delete first (outside transaction)
+ ids_to_delete = list(queryset.values_list('pk', flat=True))
+
+ # Delete everything in a single atomic transaction
+ with transaction.atomic():
+ deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete()
+
+ messages.success(request, f'Successfully deleted {total} crawls ({deleted_count} total objects including related records).')
+
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
@@ -214,7 +254,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return redirect('admin:crawls_crawl_change', new_crawl.id)
def num_snapshots(self, obj):
- return obj.snapshot_set.count()
+ # Use cached annotation from get_queryset to avoid N+1
+ return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all())
@@ -269,7 +310,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
placeholder="https://example.com
https://example2.com
# Comments start with #"
readonly>{escaped_urls}
- {line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
+ {line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 263869fe..9f11b1c4 100644
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -88,7 +88,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def __str__(self):
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
- return f'[{self.id}] {first_url[:64]}'
+        # Show the last 8 characters of the UUID and more of the URL
+ short_id = str(self.id)[-8:]
+ return f'[...{short_id}] {first_url[:120]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
diff --git a/archivebox/crawls/statemachines.py b/archivebox/crawls/statemachines.py
index 45cb62fc..99b72699 100644
--- a/archivebox/crawls/statemachines.py
+++ b/archivebox/crawls/statemachines.py
@@ -83,7 +83,7 @@ class CrawlMachine(StateMachine, strict_states=True):
# Suppressed: state transition logs
# lock the crawl object while we create snapshots
self.crawl.update_for_workers(
- retry_at=timezone.now() + timedelta(seconds=5),
+ retry_at=timezone.now(), # Process immediately
status=Crawl.StatusChoices.QUEUED,
)
@@ -96,7 +96,7 @@ class CrawlMachine(StateMachine, strict_states=True):
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
- retry_at=timezone.now() + timedelta(seconds=5),
+ retry_at=timezone.now(), # Process immediately
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
@@ -129,7 +129,7 @@ class CrawlMachine(StateMachine, strict_states=True):
timeout=60,
config_objects=[self.crawl],
crawl_id=str(self.crawl.id),
- seed_uri=first_url,
+ source_url=first_url,
)
# Process hook results - parse JSONL output and create DB objects
@@ -195,8 +195,43 @@ class CrawlMachine(StateMachine, strict_states=True):
@sealed.enter
def enter_sealed(self):
+ # Run on_CrawlEnd hooks to clean up resources (e.g., kill shared Chrome)
+ self._run_crawl_end_hooks()
+
# Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,
)
+
+ def _run_crawl_end_hooks(self):
+ """Run on_CrawlEnd hooks to clean up resources at crawl completion."""
+ from pathlib import Path
+ from archivebox.hooks import run_hooks, discover_hooks
+ from archivebox.config import CONSTANTS
+
+ # Discover and run all on_CrawlEnd hooks
+ hooks = discover_hooks('CrawlEnd')
+ if not hooks:
+ return
+
+ # Use the same temporary output directory from crawl start
+ output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
+
+ # Run all on_CrawlEnd hooks
+ first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
+ results = run_hooks(
+ event_name='CrawlEnd',
+ output_dir=output_dir,
+ timeout=30, # Cleanup hooks should be quick
+ config_objects=[self.crawl],
+ crawl_id=str(self.crawl.id),
+ source_url=first_url,
+ )
+
+ # Log any failures but don't block sealing
+ for result in results:
+ if result['returncode'] != 0:
+ print(f'[yellow]⚠️ CrawlEnd hook failed: {result.get("hook", "unknown")}[/yellow]')
+ if result.get('stderr'):
+ print(f'[dim]{result["stderr"][:200]}[/dim]')
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 4b06324a..f8f75b18 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -199,16 +199,24 @@ def run_hook(
# Build CLI arguments from kwargs
for key, value in kwargs.items():
+ # Skip keys that start with underscore (internal parameters)
+ if key.startswith('_'):
+ continue
+
arg_key = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
cmd.append(arg_key)
- elif value is not None:
+ elif value is not None and value != '':
# JSON-encode complex values, use str for simple ones
+ # Skip empty strings to avoid --key= which breaks argument parsers
if isinstance(value, (dict, list)):
cmd.append(f'{arg_key}={json.dumps(value)}')
else:
- cmd.append(f'{arg_key}={value}')
+ # Ensure value is converted to string and strip whitespace
+ str_value = str(value).strip()
+ if str_value: # Only add if non-empty after stripping
+ cmd.append(f'{arg_key}={str_value}')
# Set up environment with base paths
env = os.environ.copy()
diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py
index 1789d806..a8abf996 100644
--- a/archivebox/mcp/server.py
+++ b/archivebox/mcp/server.py
@@ -125,24 +125,64 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments
Returns MCP-formatted result with captured output and error status.
"""
+ # Setup Django for archive commands (commands that need database access)
+ from archivebox.cli import ArchiveBoxGroup
+ if cmd_name in ArchiveBoxGroup.archive_commands:
+ try:
+ from archivebox.config.django import setup_django
+ from archivebox.misc.checks import check_data_folder
+ setup_django()
+ check_data_folder()
+ except Exception as e:
+ # If Django setup fails, return error (unless it's manage/shell which handle this themselves)
+ if cmd_name not in ('manage', 'shell'):
+ return {
+ "content": [{
+ "type": "text",
+ "text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory."
+ }],
+ "isError": True
+ }
+
# Use Click's test runner to invoke command programmatically
runner = CliRunner()
+ # Build a map of parameter names to their Click types (Argument vs Option)
+ param_map = {param.name: param for param in click_command.params}
+
# Convert arguments dict to CLI args list
args = []
+ positional_args = []
+
for key, value in arguments.items():
param_name = key.replace('_', '-') # Click uses dashes
+ param = param_map.get(key)
- if isinstance(value, bool):
- if value:
+ # Check if this is a positional Argument (not an Option)
+ is_argument = isinstance(param, click.Argument)
+
+ if is_argument:
+ # Positional arguments - add them without dashes
+ if isinstance(value, list):
+ positional_args.extend([str(v) for v in value])
+ elif value is not None:
+ positional_args.append(str(value))
+ else:
+ # Options - add with dashes
+ if isinstance(value, bool):
+ if value:
+ args.append(f'--{param_name}')
+ elif isinstance(value, list):
+ # Multiple values for an option (rare)
+ for item in value:
+ args.append(f'--{param_name}')
+ args.append(str(item))
+ elif value is not None:
args.append(f'--{param_name}')
- elif isinstance(value, list):
- # Multiple values (e.g., multiple URLs)
- for item in value:
- args.append(str(item))
- elif value is not None:
- args.append(f'--{param_name}')
- args.append(str(value))
+ args.append(str(value))
+
+ # Add positional arguments at the end
+ args.extend(positional_args)
# Execute the command
try:
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 1016539e..2a725dd4 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -542,24 +542,32 @@ def log_worker_event(
"""
indent = ' ' * indent_level
- # Build worker identifier
+ from rich.markup import escape
+
+ # Build worker identifier (without URL/extractor)
worker_parts = [worker_type]
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
- if url and worker_type in ('SnapshotWorker', 'DB'):
- worker_parts.append(f'url={truncate_url(url)}')
- if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
- worker_parts.append(f'extractor={extractor}')
# Format worker label - only add brackets if there are additional identifiers
+ # Use double brackets [[...]] to escape Rich markup
if len(worker_parts) > 1:
- worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
+ worker_label = f'{worker_parts[0]}[[{", ".join(worker_parts[1:])}]]'
else:
worker_label = worker_parts[0]
+ # Build URL/extractor display (shown AFTER the label, outside brackets)
+ url_extractor_parts = []
+ if url:
+ url_extractor_parts.append(f'url: {escape(url)}')
+ if extractor:
+ url_extractor_parts.append(f'extractor: {escape(extractor)}')
+
+ url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
+
# Build metadata string
metadata_str = ''
if metadata:
@@ -592,8 +600,6 @@ def log_worker_event(
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
- elif event in ('Processing...', 'PROCESSING'):
- color = 'blue'
elif event in ('Completed', 'COMPLETED', 'All work complete'):
color = 'blue'
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
@@ -610,6 +616,12 @@ def log_worker_event(
text = Text()
text.append(indent)
text.append(f'{worker_label} {event}{error_str}', style=color)
+
+ # Add URL/extractor info first (more important)
+ if url_extractor_str:
+ text.append(f' | {url_extractor_str}')
+
+ # Then add other metadata
if metadata_str:
text.append(f' | {metadata_str}')
diff --git a/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js
similarity index 98%
rename from archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js
rename to archivebox/plugins/captcha2/on_Crawl__01_captcha2.js
index db7dd896..3e6dbca2 100755
--- a/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js
+++ b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js
@@ -8,8 +8,8 @@
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
- * Priority: 01 (early) - Must install before Chrome session starts
- * Hook: on_Snapshot
+ * Priority: 01 (early) - Must install before Chrome session starts at Crawl level
+ * Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
diff --git a/archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js
similarity index 91%
rename from archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js
rename to archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js
index f97c9ef1..d370c81f 100755
--- a/archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js
+++ b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js
@@ -2,11 +2,11 @@
/**
* 2Captcha Extension Configuration
*
- * Configures the 2captcha extension with API key after Chrome session starts.
- * Runs once per browser session to inject API key into extension storage.
+ * Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
+ * Runs once per crawl to inject API key into extension storage.
*
- * Priority: 21 (after chrome_session at 20, before navigation at 30)
- * Hook: on_Snapshot
+ * Priority: 11 (after chrome_session at 10)
+ * Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
@@ -17,8 +17,19 @@ const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-core');
-const OUTPUT_DIR = 'chrome_session';
-const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured');
+// Get crawl ID from args to find the crawl-level chrome session
+function getCrawlChromeSessionDir() {
+ const args = parseArgs();
+ const crawlId = args.crawl_id;
+ if (!crawlId) {
+ return null;
+ }
+ const dataDir = process.env.DATA_DIR || '.';
+ return path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session');
+}
+
+const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome_session';
+const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
// Get environment variable with default
function getEnv(name, defaultValue = '') {
@@ -53,7 +64,7 @@ async function configure2Captcha() {
}
// Load extensions metadata
- const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
+ const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome_session must run first' };
}
@@ -70,7 +81,7 @@ async function configure2Captcha() {
try {
// Connect to the existing Chrome session via CDP
- const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
+ const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
return { success: false, error: 'CDP URL not found - chrome_session must run first' };
}
diff --git a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
index fae91ffb..6c7133e4 100644
--- a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
+++ b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
@@ -3,10 +3,11 @@
Clean up Chrome browser session started by chrome_session extractor.
This extractor runs after all Chrome-based extractors (screenshot, pdf, dom)
-to terminate the Chrome process and clean up any leftover files.
+to clean up the Chrome session. For shared sessions (crawl-level Chrome), it
+closes only this snapshot's tab. For standalone sessions, it kills Chrome.
-Usage: on_Snapshot__24_chrome_cleanup.py --url= --snapshot-id=
-Output: Terminates Chrome process and removes lock files
+Usage: on_Snapshot__45_chrome_cleanup.py --url= --snapshot-id=
+Output: Closes tab or terminates Chrome process
Environment variables:
CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup)
@@ -18,6 +19,7 @@ import os
import signal
import sys
import time
+import urllib.request
from datetime import datetime, timezone
from pathlib import Path
@@ -33,18 +35,126 @@ def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
+def close_tab_via_cdp(cdp_url: str, page_id: str) -> bool:
+ """
+ Close a specific tab via Chrome DevTools Protocol.
+
+ Returns True if tab was closed successfully.
+ """
+ try:
+ # Extract port from WebSocket URL (ws://127.0.0.1:PORT/...)
+ import re
+ match = re.search(r':(\d+)/', cdp_url)
+ if not match:
+ return False
+ port = match.group(1)
+
+ # Use CDP HTTP endpoint to close the target
+ close_url = f'http://127.0.0.1:{port}/json/close/{page_id}'
+ req = urllib.request.Request(close_url, method='GET')
+
+ with urllib.request.urlopen(req, timeout=5) as resp:
+ return resp.status == 200
+
+ except Exception as e:
+ print(f'Failed to close tab via CDP: {e}', file=sys.stderr)
+ return False
+
+
+def kill_listener_processes() -> list[str]:
+ """
+ Kill any daemonized listener processes (consolelog, ssl, responses, etc.).
+
+ These hooks write listener.pid files that we need to kill.
+ Returns list of killed process descriptions.
+ """
+ killed = []
+ snapshot_dir = Path('.').resolve().parent # Go up from chrome_cleanup dir
+
+ # Look for listener.pid files in sibling directories
+ for extractor_dir in snapshot_dir.iterdir():
+ if not extractor_dir.is_dir():
+ continue
+
+ pid_file = extractor_dir / 'listener.pid'
+ if not pid_file.exists():
+ continue
+
+ try:
+ pid = int(pid_file.read_text().strip())
+ try:
+ os.kill(pid, signal.SIGTERM)
+ # Brief wait for graceful shutdown
+ for _ in range(5):
+ try:
+ os.kill(pid, 0)
+ time.sleep(0.05)
+ except OSError:
+ break
+ else:
+ # Force kill if still running
+ try:
+ os.kill(pid, signal.SIGKILL)
+ except OSError:
+ pass
+
+ killed.append(f'{extractor_dir.name} listener (PID {pid})')
+ except OSError as e:
+                if e.errno == 3:  # errno 3 == "No such process": listener already exited
+                    killed.append(f'{extractor_dir.name} listener (already dead)')
+ except (ValueError, FileNotFoundError):
+ pass
+
+ return killed
+
+
def cleanup_chrome_session() -> tuple[bool, str | None, str]:
"""
Clean up Chrome session started by chrome_session extractor.
+ For shared sessions (crawl-level Chrome), closes only this snapshot's tab.
+ For standalone sessions, kills the Chrome process.
+
Returns: (success, output_info, error_message)
"""
+ # First, kill any daemonized listener processes
+ killed = kill_listener_processes()
+ if killed:
+ print(f'Killed listener processes: {", ".join(killed)}')
+
session_dir = Path(CHROME_SESSION_DIR)
if not session_dir.exists():
return True, 'No chrome_session directory found', ''
+ # Check if this is a shared session
+ shared_file = session_dir / 'shared_session.txt'
+ is_shared = False
+ if shared_file.exists():
+ is_shared = shared_file.read_text().strip().lower() == 'true'
+
pid_file = session_dir / 'pid.txt'
+ cdp_file = session_dir / 'cdp_url.txt'
+ page_id_file = session_dir / 'page_id.txt'
+
+ if is_shared:
+ # Shared session - only close this snapshot's tab
+ if cdp_file.exists() and page_id_file.exists():
+ try:
+ cdp_url = cdp_file.read_text().strip()
+ page_id = page_id_file.read_text().strip()
+
+ if close_tab_via_cdp(cdp_url, page_id):
+ return True, f'Closed tab {page_id[:8]}... (shared Chrome session)', ''
+ else:
+ return True, f'Tab may already be closed (shared Chrome session)', ''
+
+ except Exception as e:
+ return True, f'Tab cleanup attempted: {e}', ''
+
+ return True, 'Shared session - Chrome stays running', ''
+
+ # Standalone session - kill the Chrome process
killed = False
if pid_file.exists():
diff --git a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
index b34c8c96..fb414ee7 100644
--- a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
+++ b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
@@ -2,38 +2,27 @@
/**
* Navigate the Chrome browser to the target URL.
*
- * This extractor runs AFTER pre-load extractors (21-29) have registered their
- * CDP listeners. It connects to the existing Chrome session, navigates to the URL,
- * waits for page load, and captures response headers.
+ * This is a simple hook that ONLY navigates - nothing else.
+ * Pre-load hooks (21-29) should set up their own CDP listeners.
+ * Post-load hooks (31+) can then read from the loaded page.
*
* Usage: on_Snapshot__30_chrome_navigate.js --url= --snapshot-id=
- * Output: Writes to chrome_session/:
- * - response_headers.json: HTTP response headers from main document
- * - final_url.txt: Final URL after any redirects
- * - page_loaded.txt: Marker file indicating navigation is complete
+ * Output: Writes page_loaded.txt marker when navigation completes
*
* Environment variables:
- * CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60)
+ * CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
- * - domcontentloaded: DOM is ready, resources may still load
- * - load: Page fully loaded including resources
- * - networkidle0: No network activity for 500ms (strictest)
- * - networkidle2: At most 2 network connections for 500ms
- *
- * # Fallbacks
- * TIMEOUT: Fallback timeout
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
-// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '../chrome_session';
+const OUTPUT_DIR = '.';
-// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
@@ -45,18 +34,10 @@ function parseArgs() {
return args;
}
-// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
-function getEnvBool(name, defaultValue = false) {
- const val = getEnv(name, '').toLowerCase();
- if (['true', '1', 'yes', 'on'].includes(val)) return true;
- if (['false', '0', 'no', 'off'].includes(val)) return false;
- return defaultValue;
-}
-
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -67,159 +48,79 @@ function getEnvFloat(name, defaultValue = 0) {
return isNaN(val) ? defaultValue : val;
}
-// Read CDP URL from chrome_session
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
- if (!fs.existsSync(cdpFile)) {
- return null;
- }
+ if (!fs.existsSync(cdpFile)) return null;
return fs.readFileSync(cdpFile, 'utf8').trim();
}
-// Read URL from chrome_session (set by chrome_session extractor)
-function getTargetUrl() {
- const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt');
- if (!fs.existsSync(urlFile)) {
- return null;
- }
- return fs.readFileSync(urlFile, 'utf8').trim();
+function getPageId() {
+ const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
+ if (!fs.existsSync(pageIdFile)) return null;
+ return fs.readFileSync(pageIdFile, 'utf8').trim();
}
-// Validate wait condition
function getWaitCondition() {
const waitFor = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
- const validConditions = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
- if (validConditions.includes(waitFor)) {
- return waitFor;
- }
- console.error(`Warning: Invalid CHROME_WAIT_FOR="${waitFor}", using networkidle2`);
- return 'networkidle2';
+ const valid = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
+ return valid.includes(waitFor) ? waitFor : 'networkidle2';
}
-// Sleep helper
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
-async function navigateToUrl(url, cdpUrl) {
+async function navigate(url, cdpUrl) {
const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
const waitUntil = getWaitCondition();
+ const pageId = getPageId();
let browser = null;
- let responseHeaders = {};
- let redirectChain = [];
- let finalUrl = url;
try {
- // Connect to existing browser
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- });
+ browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
- // Get all pages and find our target page
const pages = await browser.pages();
if (pages.length === 0) {
return { success: false, error: 'No pages found in browser' };
}
- // Use the last created page (most likely the one chrome_session created)
- const page = pages[pages.length - 1];
-
- // Set up response interception to capture headers and redirects
- page.on('response', async (response) => {
- const request = response.request();
-
- // Track redirects
- if (response.status() >= 300 && response.status() < 400) {
- redirectChain.push({
- url: response.url(),
- status: response.status(),
- location: response.headers()['location'] || null,
- });
- }
-
- // Capture headers from the main document request
- if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
- try {
- responseHeaders = {
- url: response.url(),
- status: response.status(),
- statusText: response.statusText(),
- headers: response.headers(),
- };
- finalUrl = response.url();
- } catch (e) {
- // Ignore errors capturing headers
- }
- }
- });
-
- // Navigate to URL and wait for load
- console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
-
- const response = await page.goto(url, {
- waitUntil,
- timeout,
- });
-
- // Capture final response if not already captured
- if (response && Object.keys(responseHeaders).length === 0) {
- responseHeaders = {
- url: response.url(),
- status: response.status(),
- statusText: response.statusText(),
- headers: response.headers(),
- };
- finalUrl = response.url();
+ // Find page by target ID if available
+ let page = null;
+ if (pageId) {
+ page = pages.find(p => {
+ const target = p.target();
+ return target && target._targetId === pageId;
+ });
+ }
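+    // Fall back to the most recently opened tab if the target ID lookup fails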
+ if (!page) {
+ page = pages[pages.length - 1];
}
- // Apply optional delay after load
+ // Navigate
+ console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
+ const response = await page.goto(url, { waitUntil, timeout });
+
+ // Optional delay
if (delayAfterLoad > 0) {
console.log(`Waiting ${delayAfterLoad}ms after load...`);
await sleep(delayAfterLoad);
}
- // Write response headers
- if (Object.keys(responseHeaders).length > 0) {
- // Add redirect chain to headers
- responseHeaders.redirect_chain = redirectChain;
+ const finalUrl = page.url();
+ const status = response ? response.status() : null;
- fs.writeFileSync(
- path.join(CHROME_SESSION_DIR, 'response_headers.json'),
- JSON.stringify(responseHeaders, null, 2)
- );
- }
+    // Write marker + final URL files for post-load hooks (31+)
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString());
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl);
- // Write final URL (after redirects)
- fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl);
-
- // Write marker file indicating page is loaded
- fs.writeFileSync(
- path.join(CHROME_SESSION_DIR, 'page_loaded.txt'),
- new Date().toISOString()
- );
-
- // Disconnect but leave browser running for post-load extractors
browser.disconnect();
- return {
- success: true,
- output: CHROME_SESSION_DIR,
- finalUrl,
- status: responseHeaders.status,
- redirectCount: redirectChain.length,
- };
+ return { success: true, finalUrl, status };
} catch (e) {
- // Don't close browser on error - let cleanup handle it
- if (browser) {
- try {
- browser.disconnect();
- } catch (disconnectErr) {
- // Ignore
- }
- }
+ if (browser) browser.disconnect();
return { success: false, error: `${e.name}: ${e.message}` };
}
}
@@ -239,55 +140,33 @@ async function main() {
let output = null;
let error = '';
- try {
- // Check for chrome_session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- console.error('ERROR: chrome_session not found (cdp_url.txt missing)');
- console.error('chrome_navigate requires chrome_session to run first');
- process.exit(1);
- }
+ const cdpUrl = getCdpUrl();
+ if (!cdpUrl) {
+ console.error('ERROR: chrome_session not found');
+ process.exit(1);
+ }
- // Get URL from chrome_session or use provided URL
- const targetUrl = getTargetUrl() || url;
+ const result = await navigate(url, cdpUrl);
- const result = await navigateToUrl(targetUrl, cdpUrl);
-
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- console.log(`Page loaded: ${result.finalUrl}`);
- console.log(`HTTP status: ${result.status}`);
- if (result.redirectCount > 0) {
- console.log(`Redirects: ${result.redirectCount}`);
- }
- } else {
- status = 'failed';
- error = result.error;
- }
- } catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
+ if (result.success) {
+ status = 'succeeded';
+ output = OUTPUT_DIR;
+ console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status})`);
+ } else {
+ error = result.error;
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
- // Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
- if (output) {
- console.log(`OUTPUT=${output}`);
- }
+ if (output) console.log(`OUTPUT=${output}`);
console.log(`STATUS=${status}`);
+ if (error) console.error(`ERROR=${error}`);
- if (error) {
- console.error(`ERROR=${error}`);
- }
-
- // Print JSON result
- const resultJson = {
+ console.log(`RESULT_JSON=${JSON.stringify({
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
@@ -297,8 +176,7 @@ async function main() {
duration: Math.round(duration * 100) / 100,
output,
error: error || null,
- };
- console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
+ })}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
diff --git a/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py b/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py
new file mode 100644
index 00000000..45c6aee7
--- /dev/null
+++ b/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+Clean up Chrome browser session at the end of a crawl.
+
+This runs after all snapshots in a crawl have been processed to terminate
+the shared Chrome session that was started by on_Crawl__10_chrome_session.js.
+
+Usage: on_CrawlEnd__99_chrome_cleanup.py --crawl-id=
+Output: Terminates the crawl's Chrome process
+"""
+
+import json
+import os
+import signal
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+import rich_click as click
+
+
+# Extractor metadata
+EXTRACTOR_NAME = 'chrome_cleanup'
+CHROME_SESSION_DIR = 'chrome_session'
+
+
+def get_env(name: str, default: str = '') -> str:
+ return os.environ.get(name, default).strip()
+
+
+def cleanup_crawl_chrome() -> tuple[bool, str | None, str]:
+ """
+ Clean up Chrome session for the crawl.
+
+ Returns: (success, output_info, error_message)
+ """
+ session_dir = Path(CHROME_SESSION_DIR)
+
+ if not session_dir.exists():
+ return True, 'No chrome_session directory found', ''
+
+ pid_file = session_dir / 'pid.txt'
+ killed = False
+
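+    # pid.txt is written by on_Crawl__10_chrome_session.js when the shared Chrome is launched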
+ if pid_file.exists():
+ try:
+ pid = int(pid_file.read_text().strip())
+
+ # Try graceful termination first
+ try:
+ os.kill(pid, signal.SIGTERM)
+ killed = True
+ print(f'[*] Sent SIGTERM to Chrome PID {pid}')
+
+ # Wait briefly for graceful shutdown
+ for _ in range(20):
+ try:
+ os.kill(pid, 0) # Check if still running
+ time.sleep(0.1)
+ except OSError:
+ print(f'[+] Chrome process {pid} terminated')
+ break # Process is gone
+ else:
+                    # Force kill if still running after ~2s of polling
+                    print('[!] Chrome still running, sending SIGKILL')
+ try:
+ os.kill(pid, signal.SIGKILL)
+ except OSError:
+ pass
+
+ except OSError as e:
+ # Process might already be dead, that's fine
+            if e.errno == 3:  # ESRCH: no such process
+ print(f'[*] Chrome process {pid} already terminated')
+ else:
+ return False, None, f'Failed to kill Chrome PID {pid}: {e}'
+
+ except ValueError:
+ return False, None, f'Invalid PID in {pid_file}'
+ except Exception as e:
+ return False, None, f'{type(e).__name__}: {e}'
+
+    result_info = f'Crawl Chrome cleanup: Chrome process {"terminated" if killed else "not running or pid.txt missing"}'
+ return True, result_info, ''
+
+
+@click.command()
+@click.option('--crawl-id', required=True, help='Crawl UUID')
+@click.option('--source-url', default='', help='Source URL (unused)')
+def main(crawl_id: str, source_url: str):
+ """Clean up shared Chrome browser session for crawl."""
+
+ start_ts = datetime.now(timezone.utc)
+ output = None
+ status = 'failed'
+ error = ''
+
+ try:
+ success, output, error = cleanup_crawl_chrome()
+ status = 'succeeded' if success else 'failed'
+
+ if success:
+ print(f'Crawl Chrome cleanup completed: {output}')
+
+ except Exception as e:
+ error = f'{type(e).__name__}: {e}'
+ status = 'failed'
+
+ # Print results
+ end_ts = datetime.now(timezone.utc)
+ duration = (end_ts - start_ts).total_seconds()
+
+ print(f'START_TS={start_ts.isoformat()}')
+ print(f'END_TS={end_ts.isoformat()}')
+ print(f'DURATION={duration:.2f}')
+ if output:
+ print(f'OUTPUT={output}')
+ print(f'STATUS={status}')
+
+ if error:
+ print(f'ERROR={error}', file=sys.stderr)
+
+ # Print JSON result
+ result_json = {
+ 'extractor': EXTRACTOR_NAME,
+ 'crawl_id': crawl_id,
+ 'status': status,
+ 'start_ts': start_ts.isoformat(),
+ 'end_ts': end_ts.isoformat(),
+ 'duration': round(duration, 2),
+ 'output': output,
+ 'error': error or None,
+ }
+ print(f'RESULT_JSON={json.dumps(result_json)}')
+
+ sys.exit(0 if status == 'succeeded' else 1)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js b/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js
new file mode 100644
index 00000000..b3ad9ff8
--- /dev/null
+++ b/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js
@@ -0,0 +1,343 @@
+#!/usr/bin/env node
+/**
+ * Launch a shared Chrome browser session for the entire crawl.
+ *
+ * This runs once per crawl and keeps Chrome alive for all snapshots to share.
+ * Each snapshot creates its own tab via on_Snapshot__20_chrome_session.js.
+ *
+ * Usage: on_Crawl__10_chrome_session.js --crawl-id= --source-url=
+ * Output: Creates chrome_session/ with:
+ * - cdp_url.txt: WebSocket URL for CDP connection
+ * - pid.txt: Chrome process ID (for cleanup)
+ * - port.txt: DevTools debug port
+ * - extensions.json: Metadata for loaded extensions (only written if any are installed)
+ *
+ * Environment variables:
+ * CHROME_BINARY: Path to Chrome/Chromium binary
+ * CHROME_RESOLUTION: Page resolution (default: 1440,2000)
+ * CHROME_HEADLESS: Run in headless mode (default: true)
+ * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
+ */
+
+const fs = require('fs');
+const path = require('path');
+const { spawn } = require('child_process');
+const http = require('http');
+
+// Extractor metadata
+const EXTRACTOR_NAME = 'chrome_session';
+const OUTPUT_DIR = 'chrome_session';
+
+// Parse command line arguments
+function parseArgs() {
+ const args = {};
+ process.argv.slice(2).forEach(arg => {
+ if (arg.startsWith('--')) {
+ const [key, ...valueParts] = arg.slice(2).split('=');
+ args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
+ }
+ });
+ return args;
+}
+
+// Get environment variable with default
+function getEnv(name, defaultValue = '') {
+ return (process.env[name] || defaultValue).trim();
+}
+
+function getEnvBool(name, defaultValue = false) {
+ const val = getEnv(name, '').toLowerCase();
+ if (['true', '1', 'yes', 'on'].includes(val)) return true;
+ if (['false', '0', 'no', 'off'].includes(val)) return false;
+ return defaultValue;
+}
+
+// Find Chrome binary
+function findChrome() {
+ const chromeBinary = getEnv('CHROME_BINARY');
+ if (chromeBinary && fs.existsSync(chromeBinary)) {
+ return chromeBinary;
+ }
+
+ const candidates = [
+ // Linux
+ '/usr/bin/google-chrome',
+ '/usr/bin/google-chrome-stable',
+ '/usr/bin/chromium',
+ '/usr/bin/chromium-browser',
+ // macOS
+ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ '/Applications/Chromium.app/Contents/MacOS/Chromium',
+ ];
+
+ for (const candidate of candidates) {
+ if (fs.existsSync(candidate)) {
+ return candidate;
+ }
+ }
+
+ return null;
+}
+
+// Parse resolution string
+function parseResolution(resolution) {
+ const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
+ return { width: width || 1440, height: height || 2000 };
+}
+
+// Find a free port
+function findFreePort() {
+ return new Promise((resolve, reject) => {
+ const server = require('net').createServer();
+ server.unref();
+ server.on('error', reject);
+ server.listen(0, () => {
+ const port = server.address().port;
+ server.close(() => resolve(port));
+ });
+ });
+}
+
+// Wait for Chrome's DevTools port to be ready
+function waitForDebugPort(port, timeout = 30000) {
+ const startTime = Date.now();
+
+ return new Promise((resolve, reject) => {
+ const tryConnect = () => {
+ if (Date.now() - startTime > timeout) {
+ reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
+ return;
+ }
+
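+      // /json/version returns browser metadata once Chrome is ready, e.g.
+      // {"Browser": "Chrome/...", "webSocketDebuggerUrl": "ws://127.0.0.1:<port>/devtools/browser/..."}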
+ const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
+ let data = '';
+ res.on('data', chunk => data += chunk);
+ res.on('end', () => {
+ try {
+ const info = JSON.parse(data);
+ resolve(info);
+ } catch (e) {
+ setTimeout(tryConnect, 100);
+ }
+ });
+ });
+
+ req.on('error', () => {
+ setTimeout(tryConnect, 100);
+ });
+
+ req.setTimeout(1000, () => {
+ req.destroy();
+ setTimeout(tryConnect, 100);
+ });
+ };
+
+ tryConnect();
+ });
+}
+
+async function launchChrome(binary) {
+ const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
+ const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
+ const headless = getEnvBool('CHROME_HEADLESS', true);
+
+ const { width, height } = parseResolution(resolution);
+
+ // Create output directory
+ if (!fs.existsSync(OUTPUT_DIR)) {
+ fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+ }
+
+ // Find a free port for Chrome DevTools
+ const debugPort = await findFreePort();
+ console.log(`[*] Using debug port: ${debugPort}`);
+
+ // Load any installed extensions
+ const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
+ const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
+ path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
+
+ const installedExtensions = [];
+ if (fs.existsSync(extensionsDir)) {
+ const files = fs.readdirSync(extensionsDir);
+ for (const file of files) {
+ if (file.endsWith('.extension.json')) {
+ try {
+ const extPath = path.join(extensionsDir, file);
+ const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
+ if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
+ installedExtensions.push(extData);
+ console.log(`[*] Loading extension: ${extData.name || file}`);
+ }
+ } catch (e) {
+ // Skip invalid cache files
+ console.warn(`[!] Skipping invalid extension cache: ${file}`);
+ }
+ }
+ }
+ }
+
+ // Get extension launch arguments
+ const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions);
+ if (extensionArgs.length > 0) {
+ console.log(`[+] Loaded ${installedExtensions.length} extension(s)`);
+ // Write extensions metadata for config hooks to use
+ fs.writeFileSync(
+ path.join(OUTPUT_DIR, 'extensions.json'),
+ JSON.stringify(installedExtensions, null, 2)
+ );
+ }
+
+ // Build Chrome arguments
+ const chromeArgs = [
+ `--remote-debugging-port=${debugPort}`,
+ '--remote-debugging-address=127.0.0.1',
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-gpu',
+ '--disable-sync',
+ '--no-first-run',
+ '--no-default-browser-check',
+ '--disable-default-apps',
+ '--disable-infobars',
+ '--disable-blink-features=AutomationControlled',
+ '--disable-component-update',
+ '--disable-domain-reliability',
+ '--disable-breakpad',
+ '--disable-background-networking',
+ '--disable-background-timer-throttling',
+ '--disable-backgrounding-occluded-windows',
+ '--disable-renderer-backgrounding',
+ '--disable-ipc-flooding-protection',
+ '--password-store=basic',
+ '--use-mock-keychain',
+ '--font-render-hinting=none',
+ '--force-color-profile=srgb',
+ `--window-size=${width},${height}`,
+ ...extensionArgs, // Load extensions
+ ...(headless ? ['--headless=new'] : []),
+ ...(checkSsl ? [] : ['--ignore-certificate-errors']),
+ 'about:blank', // Start with blank page
+ ];
+
+ // Launch Chrome as a child process (NOT detached - stays with crawl process)
+ // Using stdio: 'ignore' so we don't block on output but Chrome stays as our child
+ const chromeProcess = spawn(binary, chromeArgs, {
+ stdio: ['ignore', 'ignore', 'ignore'],
+ });
+
+ const chromePid = chromeProcess.pid;
+ console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
+
+  // Write PID and debug port immediately so cleanup and later hooks can find them
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
+
+ try {
+ // Wait for Chrome to be ready
+ const versionInfo = await waitForDebugPort(debugPort, 30000);
+ console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
+
+ // Build WebSocket URL
+ const wsUrl = versionInfo.webSocketDebuggerUrl;
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
+
+ return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort };
+
+ } catch (e) {
+ // Kill Chrome if setup failed
+ try {
+ process.kill(chromePid, 'SIGTERM');
+ } catch (killErr) {
+ // Ignore
+ }
+ return { success: false, error: `${e.name}: ${e.message}` };
+ }
+}
+
+async function main() {
+ const args = parseArgs();
+ const crawlId = args.crawl_id;
+
+ const startTs = new Date();
+ let status = 'failed';
+ let output = null;
+ let error = '';
+ let version = '';
+
+ try {
+ const binary = findChrome();
+ if (!binary) {
+ console.error('ERROR: Chrome/Chromium binary not found');
+ console.error('DEPENDENCY_NEEDED=chrome');
+ console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
+ console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
+ process.exit(1);
+ }
+
+ // Get Chrome version
+ try {
+ const { execSync } = require('child_process');
+ version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
+ } catch (e) {
+ version = '';
+ }
+
+ const result = await launchChrome(binary);
+
+ if (result.success) {
+ status = 'succeeded';
+ output = OUTPUT_DIR;
+ console.log(`[+] Chrome session started for crawl ${crawlId}`);
+ console.log(`[+] CDP URL: ${result.cdpUrl}`);
+ console.log(`[+] PID: ${result.pid}`);
+ } else {
+ status = 'failed';
+ error = result.error;
+ }
+ } catch (e) {
+ error = `${e.name}: ${e.message}`;
+ status = 'failed';
+ }
+
+ const endTs = new Date();
+ const duration = (endTs - startTs) / 1000;
+
+ // Print results
+ console.log(`START_TS=${startTs.toISOString()}`);
+ console.log(`END_TS=${endTs.toISOString()}`);
+ console.log(`DURATION=${duration.toFixed(2)}`);
+ if (version) {
+ console.log(`VERSION=${version}`);
+ }
+ if (output) {
+ console.log(`OUTPUT=${output}`);
+ }
+ console.log(`STATUS=${status}`);
+
+ if (error) {
+ console.error(`ERROR=${error}`);
+ }
+
+ // Print JSON result
+ const resultJson = {
+ extractor: EXTRACTOR_NAME,
+ crawl_id: crawlId,
+ status,
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ duration: Math.round(duration * 100) / 100,
+ cmd_version: version,
+ output,
+ error: error || null,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
+
+  // Exit with success - Chrome keeps running after this hook exits
+  // It will be terminated by on_CrawlEnd__99_chrome_cleanup.py at the end of the crawl
+ process.exit(status === 'succeeded' ? 0 : 1);
+}
+
+main().catch(e => {
+ console.error(`Fatal error: ${e.message}`);
+ process.exit(1);
+});
diff --git a/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js b/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js
index 84837d0a..409ba212 100755
--- a/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js
+++ b/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js
@@ -1,20 +1,21 @@
#!/usr/bin/env node
/**
- * Start a Chrome browser session for use by other extractors.
+ * Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
- * This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
- * Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
- * The chrome_navigate extractor (30) performs the actual page load.
+ * If a crawl-level Chrome session exists (from on_Crawl__10_chrome_session.js),
+ * this connects to it and creates a new tab. Otherwise, falls back to launching
+ * its own Chrome instance.
*
- * Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id=
+ * Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= --crawl-id=
* Output: Creates chrome_session/ with:
- * - cdp_url.txt: WebSocket URL for CDP connection
- * - pid.txt: Chrome process ID (for cleanup)
- * - page_id.txt: Target ID of the page for other extractors to use
- * - url.txt: The URL to be navigated to (for chrome_navigate)
+ * - cdp_url.txt: WebSocket URL for CDP connection (copied or new)
+ * - pid.txt: Chrome process ID (from crawl or new)
+ * - page_id.txt: Target ID of this snapshot's tab
+ * - url.txt: The URL to be navigated to
+ * - shared_session.txt: 'true' if reusing the crawl-level Chrome, 'false' if a new instance was launched
*
* Environment variables:
- * CHROME_BINARY: Path to Chrome/Chromium binary
+ * DATA_DIR: Data directory (to find crawl's Chrome session)
+ * CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
@@ -23,18 +24,13 @@
const fs = require('fs');
const path = require('path');
+const { spawn } = require('child_process');
+const http = require('http');
const puppeteer = require('puppeteer-core');
-// Import extension utilities
-const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
-
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
-const OUTPUT_DIR = 'chrome_session';
-
-// Get extensions directory from environment or use default
-const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
- path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
+const OUTPUT_DIR = '.'; // Hook already runs in the output directory
// Parse command line arguments
function parseArgs() {
@@ -60,13 +56,7 @@ function getEnvBool(name, defaultValue = false) {
return defaultValue;
}
-function getEnvInt(name, defaultValue = 0) {
- const val = parseInt(getEnv(name, String(defaultValue)), 10);
- return isNaN(val) ? defaultValue : val;
-}
-
-
-// Find Chrome binary
+// Find Chrome binary (for fallback)
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
@@ -74,12 +64,10 @@ function findChrome() {
}
const candidates = [
- // Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
- // macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
@@ -99,40 +87,132 @@ function parseResolution(resolution) {
return { width: width || 1440, height: height || 2000 };
}
-// Load installed extensions from cache files
-function loadInstalledExtensions() {
- const extensions = [];
+// Find a free port
+function findFreePort() {
+ return new Promise((resolve, reject) => {
+ const server = require('net').createServer();
+ server.unref();
+ server.on('error', reject);
+ server.listen(0, () => {
+ const port = server.address().port;
+ server.close(() => resolve(port));
+ });
+ });
+}
- if (!fs.existsSync(EXTENSIONS_DIR)) {
- return extensions;
- }
+// Wait for Chrome's DevTools port to be ready
+function waitForDebugPort(port, timeout = 30000) {
+ const startTime = Date.now();
- // Look for *.extension.json cache files created by extension plugins
- const files = fs.readdirSync(EXTENSIONS_DIR);
- const extensionFiles = files.filter(f => f.endsWith('.extension.json'));
+ return new Promise((resolve, reject) => {
+ const tryConnect = () => {
+ if (Date.now() - startTime > timeout) {
+ reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
+ return;
+ }
- for (const file of extensionFiles) {
+ const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
+ let data = '';
+ res.on('data', chunk => data += chunk);
+ res.on('end', () => {
+ try {
+ const info = JSON.parse(data);
+ resolve(info);
+ } catch (e) {
+ setTimeout(tryConnect, 100);
+ }
+ });
+ });
+
+ req.on('error', () => {
+ setTimeout(tryConnect, 100);
+ });
+
+ req.setTimeout(1000, () => {
+ req.destroy();
+ setTimeout(tryConnect, 100);
+ });
+ };
+
+ tryConnect();
+ });
+}
+
+// Try to find the crawl's Chrome session
+function findCrawlChromeSession(crawlId) {
+ if (!crawlId) return null;
+
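+  // Crawl-level hooks are assumed to write their chrome_session/ under DATA_DIR/tmp/crawl_<crawl_id>/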
+ const dataDir = getEnv('DATA_DIR', '.');
+ const crawlChromeDir = path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session');
+
+ const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
+ const pidFile = path.join(crawlChromeDir, 'pid.txt');
+
+ if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) {
try {
- const filePath = path.join(EXTENSIONS_DIR, file);
- const data = fs.readFileSync(filePath, 'utf-8');
- const extension = JSON.parse(data);
+ const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
+ const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
- // Verify extension is actually installed
- const manifestPath = path.join(extension.unpacked_path, 'manifest.json');
- if (fs.existsSync(manifestPath)) {
- extensions.push(extension);
- console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
+ // Verify the process is still running
+ try {
+ process.kill(pid, 0); // Signal 0 = check if process exists
+ return { cdpUrl, pid };
+ } catch (e) {
+ // Process not running
+ return null;
}
} catch (e) {
- console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`);
+ return null;
}
}
- return extensions;
+ return null;
}
+// Create a new tab in an existing Chrome session
+async function createTabInExistingChrome(cdpUrl, url, pid) {
+ const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
+ const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
+ const { width, height } = parseResolution(resolution);
-async function startChromeSession(url, binary) {
+ console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
+
+ // Connect Puppeteer to the running Chrome
+ const browser = await puppeteer.connect({
+ browserWSEndpoint: cdpUrl,
+ defaultViewport: { width, height },
+ });
+
+ // Create a new tab for this snapshot
+ const page = await browser.newPage();
+
+ // Set viewport
+ await page.setViewport({ width, height });
+
+ // Set user agent if specified
+ if (userAgent) {
+ await page.setUserAgent(userAgent);
+ }
+
+ // Get the page target ID
+ const target = page.target();
+ const targetId = target._targetId;
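+  // NOTE: _targetId is an internal Puppeteer field, so it may change between Puppeteer versions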
+
+ // Write session info
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(pid));
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'true');
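+  // shared_session.txt is presumably read by later cleanup hooks to tell whether pid.txt points at the shared crawl-level Chrome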
+
+ // Disconnect Puppeteer (Chrome and tab stay alive)
+ browser.disconnect();
+
+ return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid, shared: true };
+}
+
+// Fallback: Launch a new Chrome instance for this snapshot
+async function launchNewChrome(url, binary) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
@@ -140,115 +220,98 @@ async function startChromeSession(url, binary) {
const { width, height } = parseResolution(resolution);
- // Load installed extensions
- const extensions = loadInstalledExtensions();
- const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);
+ // Find a free port for Chrome DevTools
+ const debugPort = await findFreePort();
+ console.log(`[*] Launching new Chrome on port: ${debugPort}`);
- if (extensions.length > 0) {
- console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
- }
+ // Build Chrome arguments
+ const chromeArgs = [
+ `--remote-debugging-port=${debugPort}`,
+ '--remote-debugging-address=127.0.0.1',
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-dev-shm-usage',
+ '--disable-gpu',
+ '--disable-sync',
+ '--no-first-run',
+ '--no-default-browser-check',
+ '--disable-default-apps',
+ '--disable-infobars',
+ '--disable-blink-features=AutomationControlled',
+ '--disable-component-update',
+ '--disable-domain-reliability',
+ '--disable-breakpad',
+ '--disable-background-networking',
+ '--disable-background-timer-throttling',
+ '--disable-backgrounding-occluded-windows',
+ '--disable-renderer-backgrounding',
+ '--disable-ipc-flooding-protection',
+ '--password-store=basic',
+ '--use-mock-keychain',
+ '--font-render-hinting=none',
+ '--force-color-profile=srgb',
+ `--window-size=${width},${height}`,
+ ...(headless ? ['--headless=new'] : []),
+ ...(checkSsl ? [] : ['--ignore-certificate-errors']),
+ 'about:blank',
+ ];
- // Create output directory
- if (!fs.existsSync(OUTPUT_DIR)) {
- fs.mkdirSync(OUTPUT_DIR, { recursive: true });
- }
+ // Launch Chrome as a detached process (since no crawl-level Chrome exists)
+ const chromeProcess = spawn(binary, chromeArgs, {
+ detached: true,
+ stdio: ['ignore', 'ignore', 'ignore'],
+ });
+ chromeProcess.unref();
- let browser = null;
+ const chromePid = chromeProcess.pid;
+ console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
+
+ // Write PID immediately for cleanup
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
try {
- // Launch browser with Puppeteer
- browser = await puppeteer.launch({
- executablePath: binary,
- headless: headless ? 'new' : false,
- args: [
- '--no-sandbox',
- '--disable-setuid-sandbox',
- '--disable-dev-shm-usage',
- '--disable-gpu',
- '--disable-sync',
- '--no-first-run',
- '--no-default-browser-check',
- '--disable-default-apps',
- '--disable-infobars',
- '--disable-blink-features=AutomationControlled',
- '--disable-component-update',
- '--disable-domain-reliability',
- '--disable-breakpad',
- '--disable-background-networking',
- '--disable-background-timer-throttling',
- '--disable-backgrounding-occluded-windows',
- '--disable-renderer-backgrounding',
- '--disable-ipc-flooding-protection',
- '--password-store=basic',
- '--use-mock-keychain',
- '--font-render-hinting=none',
- '--force-color-profile=srgb',
- `--window-size=${width},${height}`,
- ...(checkSsl ? [] : ['--ignore-certificate-errors']),
- ...extensionArgs,
- ],
+ // Wait for Chrome to be ready
+ const versionInfo = await waitForDebugPort(debugPort, 30000);
+ console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
+
+ const wsUrl = versionInfo.webSocketDebuggerUrl;
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
+
+ // Connect Puppeteer to get page info
+ const browser = await puppeteer.connect({
+ browserWSEndpoint: wsUrl,
defaultViewport: { width, height },
});
- // Get the WebSocket endpoint URL
- const cdpUrl = browser.wsEndpoint();
- fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
+ let pages = await browser.pages();
+ let page = pages[0];
- // Write PID for cleanup
- const browserProcess = browser.process();
- if (browserProcess) {
- fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
+ if (!page) {
+ page = await browser.newPage();
}
- // Create a new page (but DON'T navigate yet)
- const page = await browser.newPage();
+ await page.setViewport({ width, height });
- // Set user agent if specified
if (userAgent) {
await page.setUserAgent(userAgent);
}
- // Write the page target ID so other extractors can find this specific page
const target = page.target();
const targetId = target._targetId;
+
fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
-
- // Write the URL for chrome_navigate to use
fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
+ fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'false');
- // Connect to loaded extensions at runtime (only if not already done)
- const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
- if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
- console.log('[*] Connecting to loaded extensions (first time setup)...');
- try {
- const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);
-
- // Write loaded extensions metadata for other extractors to use
- fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));
-
- console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
- console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
- } catch (e) {
- console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
- }
- } else if (extensions.length > 0) {
- console.log('[*] Extensions already loaded from previous snapshot');
- }
-
- // Don't close browser - leave it running for other extractors
- // Detach puppeteer from browser so it stays running
browser.disconnect();
- return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };
+ return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid, shared: false };
} catch (e) {
- // Kill browser if startup failed
- if (browser) {
- try {
- await browser.close();
- } catch (closeErr) {
- // Ignore
- }
+ try {
+ process.kill(chromePid, 'SIGTERM');
+ } catch (killErr) {
+ // Ignore
}
return { success: false, error: `${e.name}: ${e.message}` };
}
@@ -258,9 +321,10 @@ async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
+ const crawlId = args.crawl_id;
if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id=');
+ console.error('Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= [--crawl-id=]');
process.exit(1);
}
@@ -271,9 +335,6 @@ async function main() {
let version = '';
try {
- // chrome_session launches Chrome and creates a blank page
- // Pre-load extractors (21-29) register CDP listeners
- // chrome_navigate (30) performs actual navigation
const binary = findChrome();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
@@ -291,13 +352,24 @@ async function main() {
version = '';
}
- const result = await startChromeSession(url, binary);
+ // Try to use existing crawl Chrome session
+ const crawlSession = findCrawlChromeSession(crawlId);
+ let result;
+
+ if (crawlSession) {
+ console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
+ result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
+ } else {
+ console.log(`[*] No crawl Chrome session found, launching new Chrome`);
+ result = await launchNewChrome(url, binary);
+ }
if (result.success) {
status = 'succeeded';
output = result.output;
- console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
- console.log(`Page target ID: ${result.targetId}`);
+ console.log(`[+] Chrome session ready (shared: ${result.shared})`);
+ console.log(`[+] CDP URL: ${result.cdpUrl}`);
+ console.log(`[+] Page target ID: ${result.targetId}`);
} else {
status = 'failed';
error = result.error;
@@ -331,6 +403,7 @@ async function main() {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
+ crawl_id: crawlId || null,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
index fc90aa03..f305b08a 100755
--- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
+++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
@@ -1,31 +1,24 @@
#!/usr/bin/env node
/**
- * Capture console output from a page.
+ * Capture console output from a page (DAEMON MODE).
*
- * Captures all console messages during page load:
- * - log, warn, error, info, debug
- * - Includes stack traces for errors
- * - Timestamps for each message
+ * This hook daemonizes and stays alive to capture console logs throughout
+ * the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
- * Usage: on_Snapshot__14_consolelog.js --url= --snapshot-id=
- * Output: Writes consolelog/console.jsonl (one message per line)
- *
- * Environment variables:
- * SAVE_CONSOLELOG: Enable console log capture (default: true)
- * CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5)
+ * Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id=
+ * Output: Writes console.jsonl + listener.pid
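+ *
+ * Each console.jsonl line is a single JSON object, e.g.:
+ *   {"timestamp": "...", "type": "log", "text": "...", "args": [...], "location": {...}}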
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
-// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
+const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
-// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
@@ -37,7 +30,6 @@ function parseArgs() {
return args;
}
-// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
@@ -49,12 +41,6 @@ function getEnvBool(name, defaultValue = false) {
return defaultValue;
}
-function getEnvInt(name, defaultValue = 0) {
- const val = parseInt(getEnv(name, String(defaultValue)), 10);
- return isNaN(val) ? defaultValue : val;
-}
-
-// Get CDP URL from chrome_session
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
@@ -63,7 +49,14 @@ function getCdpUrl() {
return null;
}
-// Serialize console message arguments
+function getPageId() {
+ const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
+ if (fs.existsSync(pageIdFile)) {
+ return fs.readFileSync(pageIdFile, 'utf8').trim();
+ }
+ return null;
+}
+
async function serializeArgs(args) {
const serialized = [];
for (const arg of args) {
@@ -71,7 +64,6 @@ async function serializeArgs(args) {
const json = await arg.jsonValue();
serialized.push(json);
} catch (e) {
- // If jsonValue() fails, try to get text representation
try {
serialized.push(String(arg));
} catch (e2) {
@@ -82,128 +74,84 @@ async function serializeArgs(args) {
return serialized;
}
-// Capture console logs
-async function captureConsoleLogs(url) {
- const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
-
- // Output directory is current directory (hook already runs in output dir)
+async function setupListeners() {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+ fs.writeFileSync(outputPath, ''); // Clear existing
- // Clear existing file
- fs.writeFileSync(outputPath, '');
-
- let browser = null;
- const consoleLogs = [];
-
- try {
- // Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
- }
-
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- });
-
- // Get the page
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- return { success: false, error: 'No page found in Chrome session' };
- }
-
- // Listen for console messages
- page.on('console', async (msg) => {
- try {
- const type = msg.type();
- const text = msg.text();
- const location = msg.location();
- const args = await serializeArgs(msg.args());
-
- const logEntry = {
- timestamp: new Date().toISOString(),
- type,
- text,
- args,
- location: {
- url: location.url || '',
- lineNumber: location.lineNumber,
- columnNumber: location.columnNumber,
- },
- };
-
- // Write immediately to file
- fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
- consoleLogs.push(logEntry);
- } catch (e) {
- // Error processing console message, skip it
- console.error(`Error processing console message: ${e.message}`);
- }
- });
-
- // Listen for page errors
- page.on('pageerror', (error) => {
- try {
- const logEntry = {
- timestamp: new Date().toISOString(),
- type: 'error',
- text: error.message,
- stack: error.stack || '',
- location: {},
- };
-
- fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
- consoleLogs.push(logEntry);
- } catch (e) {
- console.error(`Error processing page error: ${e.message}`);
- }
- });
-
- // Listen for request failures
- page.on('requestfailed', (request) => {
- try {
- const failure = request.failure();
- const logEntry = {
- timestamp: new Date().toISOString(),
- type: 'request_failed',
- text: `Request failed: ${request.url()}`,
- error: failure ? failure.errorText : 'Unknown error',
- url: request.url(),
- location: {},
- };
-
- fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
- consoleLogs.push(logEntry);
- } catch (e) {
- console.error(`Error processing request failure: ${e.message}`);
- }
- });
-
- // Wait to capture logs
- await new Promise(resolve => setTimeout(resolve, captureTimeout));
-
- // Group logs by type
- const logStats = consoleLogs.reduce((acc, log) => {
- acc[log.type] = (acc[log.type] || 0) + 1;
- return acc;
- }, {});
-
- return {
- success: true,
- output: outputPath,
- logCount: consoleLogs.length,
- logStats,
- };
-
- } catch (e) {
- return { success: false, error: `${e.name}: ${e.message}` };
- } finally {
- if (browser) {
- browser.disconnect();
- }
+ const cdpUrl = getCdpUrl();
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found');
}
+
+ const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
+
+ // Find our page
+ const pages = await browser.pages();
+ const pageId = getPageId();
+ let page = null;
+
+ if (pageId) {
+ page = pages.find(p => {
+ const target = p.target();
+ return target && target._targetId === pageId;
+ });
+ }
+ if (!page) {
+ page = pages[pages.length - 1];
+ }
+
+ if (!page) {
+ throw new Error('No page found');
+ }
+
+ // Set up listeners that write directly to file
+ page.on('console', async (msg) => {
+ try {
+ const logEntry = {
+ timestamp: new Date().toISOString(),
+ type: msg.type(),
+ text: msg.text(),
+ args: await serializeArgs(msg.args()),
+ location: msg.location(),
+ };
+ fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+ } catch (e) {
+ // Ignore errors
+ }
+ });
+
+ page.on('pageerror', (error) => {
+ try {
+ const logEntry = {
+ timestamp: new Date().toISOString(),
+ type: 'error',
+ text: error.message,
+ stack: error.stack || '',
+ };
+ fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+ } catch (e) {
+ // Ignore
+ }
+ });
+
+ page.on('requestfailed', (request) => {
+ try {
+ const failure = request.failure();
+ const logEntry = {
+ timestamp: new Date().toISOString(),
+ type: 'request_failed',
+ text: `Request failed: ${request.url()}`,
+ error: failure ? failure.errorText : 'Unknown error',
+ url: request.url(),
+ };
+ fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
+ } catch (e) {
+ // Ignore
+ }
+ });
+
+ // Don't disconnect - keep browser connection alive
+ return { browser, page };
}
async function main() {
@@ -212,80 +160,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__14_consolelog.js --url= --snapshot-id=');
+ console.error('Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id=');
process.exit(1);
}
+ if (!getEnvBool('SAVE_CONSOLELOG', true)) {
+ console.log('Skipping (SAVE_CONSOLELOG=False)');
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ status: 'skipped',
+ url,
+ snapshot_id: snapshotId,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(0);
+ }
+
const startTs = new Date();
- let status = 'failed';
- let output = null;
- let error = '';
- let logCount = 0;
try {
- // Check if enabled
- if (!getEnvBool('SAVE_CONSOLELOG', true)) {
- console.log('Skipping console log (SAVE_CONSOLELOG=False)');
- status = 'skipped';
- const endTs = new Date();
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`STATUS=${status}`);
- console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
- process.exit(0);
- }
+ // Set up listeners
+ await setupListeners();
- const result = await captureConsoleLogs(url);
+ // Write PID file so chrome_cleanup can kill us
+ fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- logCount = result.logCount || 0;
- const statsStr = Object.entries(result.logStats || {})
- .map(([type, count]) => `${count} ${type}`)
- .join(', ');
- console.log(`Captured ${logCount} console messages: ${statsStr}`);
- } else {
- status = 'failed';
- error = result.error;
+ // Report success immediately (we're staying alive in background)
+ const endTs = new Date();
+ const duration = (endTs - startTs) / 1000;
+
+ console.log(`START_TS=${startTs.toISOString()}`);
+ console.log(`END_TS=${endTs.toISOString()}`);
+ console.log(`DURATION=${duration.toFixed(2)}`);
+ console.log(`OUTPUT=${OUTPUT_FILE}`);
+ console.log(`STATUS=succeeded`);
+
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'succeeded',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ duration: Math.round(duration * 100) / 100,
+ output: OUTPUT_FILE,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+
+ // Daemonize: detach from parent and keep running
+ // This process will be killed by chrome_cleanup
+ if (process.stdin.isTTY) {
+ process.stdin.pause();
}
+ process.stdin.unref();
+    // NOTE: process.stdout/process.stderr cannot be closed in Node.js (end() throws),
+    // so only stdin is detached; chrome_cleanup stops this process via listener.pid
+
+ // Keep the process alive indefinitely
+ // Will be killed by chrome_cleanup via the PID file
+ setInterval(() => {}, 1000);
+
} catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
- }
-
- const endTs = new Date();
- const duration = (endTs - startTs) / 1000;
-
- // Print results
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`DURATION=${duration.toFixed(2)}`);
- if (output) {
- console.log(`OUTPUT=${output}`);
- }
- console.log(`STATUS=${status}`);
-
- if (error) {
+ const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
+
+ const endTs = new Date();
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'failed',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ error,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(1);
}
-
- // Print JSON result
- const resultJson = {
- extractor: EXTRACTOR_NAME,
- url,
- snapshot_id: snapshotId,
- status,
- start_ts: startTs.toISOString(),
- end_ts: endTs.toISOString(),
- duration: Math.round(duration * 100) / 100,
- output,
- log_count: logCount,
- error: error || null,
- };
- console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
-
- process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js
similarity index 97%
rename from archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js
rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js
index 0f2672f5..77b50dec 100755
--- a/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js
+++ b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js
@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
- * Priority: 02 (early) - Must install before Chrome session starts
- * Hook: on_Snapshot
+ * Priority: 02 (early) - Must install before Chrome session starts at Crawl level
+ * Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Dismisses cookie consent popups
diff --git a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
deleted file mode 100755
index aaa43232..00000000
--- a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
+++ /dev/null
@@ -1,278 +0,0 @@
-#!/usr/bin/env node
-/**
- * Track complete redirect chains for a URL.
- *
- * Captures:
- * - HTTP redirects (301, 302, 303, 307, 308)
- * - Meta refresh redirects
- * - JavaScript redirects (basic detection)
- * - Full redirect chain with timestamps
- *
- * Usage: on_Snapshot__15_redirects.js --url= --snapshot-id=
- * Output: Writes redirects/redirects.json
- *
- * Environment variables:
- * SAVE_REDIRECTS: Enable redirect tracking (default: true)
- */
-
-const fs = require('fs');
-const path = require('path');
-const puppeteer = require('puppeteer-core');
-
-// Extractor metadata
-const EXTRACTOR_NAME = 'redirects';
-const OUTPUT_DIR = '.';
-const OUTPUT_FILE = 'redirects.json';
-const CHROME_SESSION_DIR = '../chrome_session';
-
-// Parse command line arguments
-function parseArgs() {
- const args = {};
- process.argv.slice(2).forEach(arg => {
- if (arg.startsWith('--')) {
- const [key, ...valueParts] = arg.slice(2).split('=');
- args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
- }
- });
- return args;
-}
-
-// Get environment variable with default
-function getEnv(name, defaultValue = '') {
- return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvBool(name, defaultValue = false) {
- const val = getEnv(name, '').toLowerCase();
- if (['true', '1', 'yes', 'on'].includes(val)) return true;
- if (['false', '0', 'no', 'off'].includes(val)) return false;
- return defaultValue;
-}
-
-// Get CDP URL from chrome_session
-function getCdpUrl() {
- const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
- if (fs.existsSync(cdpFile)) {
- return fs.readFileSync(cdpFile, 'utf8').trim();
- }
- return null;
-}
-
-// Track redirect chain
-async function trackRedirects(url) {
- // Output directory is current directory (hook already runs in output dir)
- const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
- let browser = null;
- const redirectChain = [];
-
- try {
- // Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
- }
-
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- });
-
- // Get the page
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- return { success: false, error: 'No page found in Chrome session' };
- }
-
- // Track all responses to capture redirects
- page.on('response', async (response) => {
- const status = response.status();
- const responseUrl = response.url();
- const headers = response.headers();
-
- // Check if it's a redirect
- if (status >= 300 && status < 400) {
- redirectChain.push({
- timestamp: new Date().toISOString(),
- url: responseUrl,
- status,
- statusText: response.statusText(),
- location: headers['location'] || headers['Location'] || '',
- type: 'http',
- });
- }
- });
-
- // Get the current URL (which is the final destination after redirects)
- const finalUrl = page.url();
-
- // Check for meta refresh redirects
- const metaRefresh = await page.evaluate(() => {
- const meta = document.querySelector('meta[http-equiv="refresh"]');
- if (meta) {
- const content = meta.getAttribute('content') || '';
- const match = content.match(/url=['"]?([^'"]+)['"]?/i);
- return {
- content,
- url: match ? match[1] : null,
- };
- }
- return null;
- });
-
- if (metaRefresh && metaRefresh.url) {
- redirectChain.push({
- timestamp: new Date().toISOString(),
- url: finalUrl,
- status: null,
- statusText: 'Meta Refresh',
- location: metaRefresh.url,
- type: 'meta_refresh',
- content: metaRefresh.content,
- });
- }
-
- // Check for JavaScript redirects (basic detection)
- const jsRedirect = await page.evaluate(() => {
- // Check for common JavaScript redirect patterns
- const html = document.documentElement.outerHTML;
- const patterns = [
- /window\.location\s*=\s*['"]([^'"]+)['"]/i,
- /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
- /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
- /document\.location\s*=\s*['"]([^'"]+)['"]/i,
- ];
-
- for (const pattern of patterns) {
- const match = html.match(pattern);
- if (match) {
- return {
- pattern: pattern.toString(),
- url: match[1],
- };
- }
- }
- return null;
- });
-
- if (jsRedirect && jsRedirect.url) {
- redirectChain.push({
- timestamp: new Date().toISOString(),
- url: finalUrl,
- status: null,
- statusText: 'JavaScript Redirect',
- location: jsRedirect.url,
- type: 'javascript',
- pattern: jsRedirect.pattern,
- });
- }
-
- const redirectData = {
- original_url: url,
- final_url: finalUrl,
- redirect_count: redirectChain.length,
- redirects: redirectChain,
- is_redirect: redirectChain.length > 0,
- };
-
- // Write output
- fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2));
-
- return { success: true, output: outputPath, redirectData };
-
- } catch (e) {
- return { success: false, error: `${e.name}: ${e.message}` };
- } finally {
- if (browser) {
- browser.disconnect();
- }
- }
-}
-
-async function main() {
- const args = parseArgs();
- const url = args.url;
- const snapshotId = args.snapshot_id;
-
- if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__15_redirects.js --url= --snapshot-id=');
- process.exit(1);
- }
-
- const startTs = new Date();
- let status = 'failed';
- let output = null;
- let error = '';
-
- try {
- // Check if enabled
- if (!getEnvBool('SAVE_REDIRECTS', true)) {
- console.log('Skipping redirects (SAVE_REDIRECTS=False)');
- status = 'skipped';
- const endTs = new Date();
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`STATUS=${status}`);
- console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
- process.exit(0);
- }
-
- const result = await trackRedirects(url);
-
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- const redirectCount = result.redirectData.redirect_count;
- const finalUrl = result.redirectData.final_url;
- if (redirectCount > 0) {
- console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`);
- } else {
- console.log('No redirects detected');
- }
- } else {
- status = 'failed';
- error = result.error;
- }
- } catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
- }
-
- const endTs = new Date();
- const duration = (endTs - startTs) / 1000;
-
- // Print results
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`DURATION=${duration.toFixed(2)}`);
- if (output) {
- console.log(`OUTPUT=${output}`);
- }
- console.log(`STATUS=${status}`);
-
- if (error) {
- console.error(`ERROR=${error}`);
- }
-
- // Print JSON result
- const resultJson = {
- extractor: EXTRACTOR_NAME,
- url,
- snapshot_id: snapshotId,
- status,
- start_ts: startTs.toISOString(),
- end_ts: endTs.toISOString(),
- duration: Math.round(duration * 100) / 100,
- output,
- error: error || null,
- };
- console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
-
- process.exit(status === 'succeeded' ? 0 : 1);
-}
-
-main().catch(e => {
- console.error(`Fatal error: ${e.message}`);
- process.exit(1);
-});
diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.js
new file mode 100755
index 00000000..9a4188a5
--- /dev/null
+++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.js
@@ -0,0 +1,248 @@
+#!/usr/bin/env node
+/**
+ * Detect redirects by comparing original URL to final URL.
+ *
+ * This runs AFTER chrome_navigate and checks:
+ * - URL changed (HTTP redirect occurred)
+ * - Meta refresh tags (pending redirects)
+ * - JavaScript redirects (basic detection)
+ *
+ * Usage: on_Snapshot__31_redirects.js --url=<url> --snapshot-id=<snapshot-id>
+ * Output: Writes redirects.json
+ */
+
+const fs = require('fs');
+const path = require('path');
+const puppeteer = require('puppeteer-core');
+
+const EXTRACTOR_NAME = 'redirects';
+const OUTPUT_DIR = '.';
+const OUTPUT_FILE = 'redirects.json';
+const CHROME_SESSION_DIR = '../chrome_session';
+const CHROME_NAVIGATE_DIR = '../chrome_navigate';
+
+function parseArgs() {
+ const args = {};
+ process.argv.slice(2).forEach(arg => {
+ if (arg.startsWith('--')) {
+ const [key, ...valueParts] = arg.slice(2).split('=');
+ args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
+ }
+ });
+ return args;
+}
+
+function getEnv(name, defaultValue = '') {
+ return (process.env[name] || defaultValue).trim();
+}
+
+function getEnvBool(name, defaultValue = false) {
+ const val = getEnv(name, '').toLowerCase();
+ if (['true', '1', 'yes', 'on'].includes(val)) return true;
+ if (['false', '0', 'no', 'off'].includes(val)) return false;
+ return defaultValue;
+}
+
+function getCdpUrl() {
+ const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
+ if (fs.existsSync(cdpFile)) {
+ return fs.readFileSync(cdpFile, 'utf8').trim();
+ }
+ return null;
+}
+
+function getPageId() {
+ const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
+ if (fs.existsSync(pageIdFile)) {
+ return fs.readFileSync(pageIdFile, 'utf8').trim();
+ }
+ return null;
+}
+
+function getFinalUrl() {
+ // Try chrome_navigate output first
+ const navFile = path.join(CHROME_NAVIGATE_DIR, 'final_url.txt');
+ if (fs.existsSync(navFile)) {
+ return fs.readFileSync(navFile, 'utf8').trim();
+ }
+ return null;
+}
+
+async function detectRedirects(originalUrl) {
+ const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+ const redirects = [];
+
+ // Get final URL from chrome_navigate
+ let finalUrl = getFinalUrl() || originalUrl;
+
+ // Check if URL changed (indicates redirect)
+ const urlChanged = originalUrl !== finalUrl;
+ if (urlChanged) {
+ redirects.push({
+ timestamp: new Date().toISOString(),
+ from_url: originalUrl,
+ to_url: finalUrl,
+ type: 'http',
+ detected_by: 'url_comparison',
+ });
+ }
+
+ // Connect to Chrome to check for meta refresh and JS redirects
+ const cdpUrl = getCdpUrl();
+ if (cdpUrl) {
+ let browser = null;
+ try {
+ browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
+
+ const pages = await browser.pages();
+ const pageId = getPageId();
+ let page = null;
+
+ if (pageId) {
+ page = pages.find(p => {
+ const target = p.target();
+ return target && target._targetId === pageId;
+ });
+ }
+ if (!page) {
+ page = pages.find(p => p.url().startsWith('http')) || pages[pages.length - 1];
+ }
+
+ if (page) {
+ // Update finalUrl from actual page
+ const pageUrl = page.url();
+ if (pageUrl && pageUrl !== 'about:blank') {
+ finalUrl = pageUrl;
+ }
+
+ // Check for meta refresh
+ try {
+ const metaRefresh = await page.evaluate(() => {
+ const meta = document.querySelector('meta[http-equiv="refresh"]');
+ if (meta) {
+ const content = meta.getAttribute('content') || '';
+ const match = content.match(/url=['"]?([^'";\s]+)['"]?/i);
+ return { content, url: match ? match[1] : null };
+ }
+ return null;
+ });
+
+ if (metaRefresh && metaRefresh.url) {
+ redirects.push({
+ timestamp: new Date().toISOString(),
+ from_url: finalUrl,
+ to_url: metaRefresh.url,
+ type: 'meta_refresh',
+ content: metaRefresh.content,
+ });
+ }
+ } catch (e) { /* ignore */ }
+
+ // Check for JS redirects
+ try {
+ const jsRedirect = await page.evaluate(() => {
+ const html = document.documentElement.outerHTML;
+ const patterns = [
+ /window\.location\s*=\s*['"]([^'"]+)['"]/i,
+ /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
+ /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
+ ];
+ for (const pattern of patterns) {
+ const match = html.match(pattern);
+ if (match) return { url: match[1], pattern: pattern.toString() };
+ }
+ return null;
+ });
+
+ if (jsRedirect && jsRedirect.url) {
+ redirects.push({
+ timestamp: new Date().toISOString(),
+ from_url: finalUrl,
+ to_url: jsRedirect.url,
+ type: 'javascript',
+ });
+ }
+ } catch (e) { /* ignore */ }
+ }
+
+ browser.disconnect();
+ } catch (e) {
+ console.error(`Warning: Could not connect to Chrome: ${e.message}`);
+ }
+ }
+
+ const result = {
+ original_url: originalUrl,
+ final_url: finalUrl,
+ redirect_count: redirects.length,
+ redirects,
+ is_redirect: originalUrl !== finalUrl || redirects.length > 0,
+ };
+
+ fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));
+ return { success: true, output: outputPath, data: result };
+}
+
+async function main() {
+ const args = parseArgs();
+ const url = args.url;
+ const snapshotId = args.snapshot_id;
+
+ if (!url || !snapshotId) {
+ console.error('Usage: on_Snapshot__31_redirects.js --url=<url> --snapshot-id=<snapshot-id>');
+ process.exit(1);
+ }
+
+ const startTs = new Date();
+ let status = 'failed';
+ let output = null;
+ let error = '';
+
+ if (!getEnvBool('SAVE_REDIRECTS', true)) {
+ console.log('Skipping redirects (SAVE_REDIRECTS=False)');
+ status = 'skipped';
+ } else {
+ try {
+ const result = await detectRedirects(url);
+ status = 'succeeded';
+ output = result.output;
+
+ if (result.data.is_redirect) {
+ console.log(`Redirect detected: ${url} -> ${result.data.final_url}`);
+ } else {
+ console.log('No redirects detected');
+ }
+ } catch (e) {
+ error = `${e.name}: ${e.message}`;
+ }
+ }
+
+ const endTs = new Date();
+ const duration = (endTs - startTs) / 1000;
+
+ console.log(`START_TS=${startTs.toISOString()}`);
+ console.log(`END_TS=${endTs.toISOString()}`);
+ console.log(`DURATION=${duration.toFixed(2)}`);
+ if (output) console.log(`OUTPUT=${output}`);
+ console.log(`STATUS=${status}`);
+ if (error) console.error(`ERROR=${error}`);
+
+ console.log(`RESULT_JSON=${JSON.stringify({
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status,
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ duration: Math.round(duration * 100) / 100,
+ output,
+ error: error || null,
+ })}`);
+
+ process.exit(status === 'succeeded' ? 0 : 1);
+}
+
+main().catch(e => {
+ console.error(`Fatal error: ${e.message}`);
+ process.exit(1);
+});
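
For reference, detectRedirects() above writes redirects.json with original_url, final_url, redirect_count, redirects, and is_redirect. A hypothetical downstream consumer (not part of this change) might read it like this; the field names mirror the result object built in the hook:

// Hypothetical consumer of redirects.json written by on_Snapshot__31_redirects.js.
const fs = require('fs');

const data = JSON.parse(fs.readFileSync('redirects.json', 'utf8'));
if (data.is_redirect) {
  console.log(`${data.original_url} -> ${data.final_url} (${data.redirect_count} hop(s))`);
  for (const hop of data.redirects) {
    // each hop records type ('http', 'meta_refresh', or 'javascript') plus from/to URLs
    console.log(`  [${hop.type}] ${hop.from_url} -> ${hop.to_url}`);
  }
} else {
  console.log('No redirects recorded');
}
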
diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.js b/archivebox/plugins/responses/on_Snapshot__24_responses.js
index c69743b4..5c65f3fe 100755
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.js
@@ -1,22 +1,12 @@
#!/usr/bin/env node
/**
- * Archive all network responses during page load.
+ * Archive all network responses during page load (DAEMON MODE).
*
- * Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.)
- * Saves them in an organized directory structure with both timestamped unique files
- * and URL-organized symlinks.
+ * This hook daemonizes and stays alive to capture network responses throughout
+ * the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
- * Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<snapshot-id>
- * Output: Creates responses/ directory with:
- * - all/<timestamp>__<method>__<urlhash>.<ext>: Timestamped unique files
- * - <resourcetype>/<hostname>/<path>/<filename>: URL-organized symlinks by resource type
- * - index.jsonl: Searchable index of all responses
- *
- * Environment variables:
- * SAVE_RESPONSES: Enable response archiving (default: true)
- * RESPONSES_TIMEOUT: Timeout in seconds (default: 120)
- * RESPONSES_TYPES: Comma-separated resource types to save (default: all)
- * Options: script,stylesheet,font,image,media,xhr,websocket,document
+ * Usage: on_Snapshot__24_responses.js --url=<url> --snapshot-id=<snapshot-id>
+ * Output: Creates responses/ directory with index.jsonl + listener.pid
*/
const fs = require('fs');
@@ -27,6 +17,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
const OUTPUT_DIR = '.';
+const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
@@ -70,6 +61,14 @@ function getCdpUrl() {
return null;
}
+function getPageId() {
+ const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
+ if (fs.existsSync(pageIdFile)) {
+ return fs.readFileSync(pageIdFile, 'utf8').trim();
+ }
+ return null;
+}
+
// Get file extension from MIME type
function getExtensionFromMimeType(mimeType) {
const mimeMap = {
@@ -139,17 +138,14 @@ async function createSymlink(target, linkPath) {
fs.symlinkSync(relativePath, linkPath);
} catch (e) {
// Ignore symlink errors (file conflicts, permissions, etc.)
- console.error(`Failed to create symlink: ${e.message}`);
}
}
-// Archive responses by intercepting network traffic
-async function archiveResponses(originalUrl) {
- const timeout = (getEnvInt('RESPONSES_TIMEOUT') || getEnvInt('TIMEOUT', 120)) * 1000;
+// Set up response listener
+async function setupListener() {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
- // Output directory is current directory (hook already runs in output dir)
// Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
@@ -160,138 +156,119 @@ async function archiveResponses(originalUrl) {
const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
fs.writeFileSync(indexPath, ''); // Clear existing
- let browser = null;
- let savedCount = 0;
- const savedResponses = [];
-
- try {
- // Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
- }
-
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
- });
-
- // Get the page
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
-
- if (!page) {
- return { success: false, error: 'No page found in Chrome session' };
- }
-
- // Enable request interception
- await page.setRequestInterception(false); // Don't block requests
-
- // Listen for responses
- page.on('response', async (response) => {
- try {
- const request = response.request();
- const url = response.url();
- const resourceType = request.resourceType().toLowerCase();
- const method = request.method();
- const status = response.status();
-
- // Skip redirects and errors
- if (status >= 300 && status < 400) return;
- if (status >= 400 && status < 600) return;
-
- // Check if we should save this resource type
- if (typesToSave.length && !typesToSave.includes(resourceType)) {
- return;
- }
-
- // Get response body
- let bodyBuffer = null;
- try {
- bodyBuffer = await response.buffer();
- } catch (e) {
- // Some responses can't be captured (already consumed, etc.)
- return;
- }
-
- if (!bodyBuffer || bodyBuffer.length === 0) {
- return;
- }
-
- // Determine file extension
- const mimeType = response.headers()['content-type'] || '';
- let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
-
- // Create timestamp-based unique filename
- const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
- const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
- const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
- const uniquePath = path.join(allDir, uniqueFilename);
-
- // Save to unique file
- fs.writeFileSync(uniquePath, bodyBuffer);
-
- // Create URL-organized symlink
- try {
- const urlObj = new URL(url);
- const hostname = urlObj.hostname;
- const pathname = urlObj.pathname || '/';
- const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
- const dirPath = path.dirname(pathname);
-
- // Create symlink: responses/<resourcetype>/<hostname>/<path>/<filename>
- const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
- const symlinkPath = path.join(symlinkDir, filename);
- await createSymlink(uniquePath, symlinkPath);
- } catch (e) {
- // URL parsing or symlink creation failed, skip
- }
-
- // Calculate SHA256
- const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
- const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
-
- // Write to index
- const indexEntry = {
- ts: timestamp,
- method,
- url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
- urlSha256,
- status,
- resourceType,
- mimeType: mimeType.split(';')[0],
- responseSha256: sha256,
- path: './' + path.relative(OUTPUT_DIR, uniquePath),
- extension,
- };
-
- fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
- savedResponses.push(indexEntry);
- savedCount++;
-
- } catch (e) {
- // Log but don't fail the whole extraction
- console.error(`Error capturing response: ${e.message}`);
- }
- });
-
- // Wait a bit to ensure we capture responses
- // (chrome_session already loaded the page, just capture any remaining traffic)
- await new Promise(resolve => setTimeout(resolve, 2000));
-
- return {
- success: true,
- output: OUTPUT_DIR,
- savedCount,
- indexPath,
- };
-
- } catch (e) {
- return { success: false, error: `${e.name}: ${e.message}` };
- } finally {
- if (browser) {
- browser.disconnect();
- }
+ const cdpUrl = getCdpUrl();
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found');
}
+
+ const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
+
+ // Find our page
+ const pages = await browser.pages();
+ const pageId = getPageId();
+ let page = null;
+
+ if (pageId) {
+ page = pages.find(p => {
+ const target = p.target();
+ return target && target._targetId === pageId;
+ });
+ }
+ if (!page) {
+ page = pages[pages.length - 1];
+ }
+
+ if (!page) {
+ throw new Error('No page found');
+ }
+
+ // Set up response listener to capture network traffic
+ page.on('response', async (response) => {
+ try {
+ const request = response.request();
+ const url = response.url();
+ const resourceType = request.resourceType().toLowerCase();
+ const method = request.method();
+ const status = response.status();
+
+ // Skip redirects and errors
+ if (status >= 300 && status < 400) return;
+ if (status >= 400 && status < 600) return;
+
+ // Check if we should save this resource type
+ if (typesToSave.length && !typesToSave.includes(resourceType)) {
+ return;
+ }
+
+ // Get response body
+ let bodyBuffer = null;
+ try {
+ bodyBuffer = await response.buffer();
+ } catch (e) {
+ // Some responses can't be captured (already consumed, etc.)
+ return;
+ }
+
+ if (!bodyBuffer || bodyBuffer.length === 0) {
+ return;
+ }
+
+ // Determine file extension
+ const mimeType = response.headers()['content-type'] || '';
+ let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
+
+ // Create timestamp-based unique filename
+ const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
+ const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
+ const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
+ const uniquePath = path.join(allDir, uniqueFilename);
+
+ // Save to unique file
+ fs.writeFileSync(uniquePath, bodyBuffer);
+
+ // Create URL-organized symlink
+ try {
+ const urlObj = new URL(url);
+ const hostname = urlObj.hostname;
+ const pathname = urlObj.pathname || '/';
+ const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
+ const dirPath = path.dirname(pathname);
+
+ // Create symlink: responses/<resourcetype>/<hostname>/<path>/<filename>
+ const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
+ const symlinkPath = path.join(symlinkDir, filename);
+ await createSymlink(uniquePath, symlinkPath);
+ } catch (e) {
+ // URL parsing or symlink creation failed, skip
+ }
+
+ // Calculate SHA256
+ const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
+ const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
+
+ // Write to index
+ const indexEntry = {
+ ts: timestamp,
+ method,
+ url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
+ urlSha256,
+ status,
+ resourceType,
+ mimeType: mimeType.split(';')[0],
+ responseSha256: sha256,
+ path: './' + path.relative(OUTPUT_DIR, uniquePath),
+ extension,
+ };
+
+ fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
+
+ } catch (e) {
+ // Ignore errors
+ }
+ });
+
+ // Don't disconnect - keep browser connection alive
+ return { browser, page };
}
async function main() {
@@ -300,77 +277,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<snapshot-id>');
+ console.error('Usage: on_Snapshot__24_responses.js --url=<url> --snapshot-id=<snapshot-id>');
process.exit(1);
}
+ if (!getEnvBool('SAVE_RESPONSES', true)) {
+ console.log('Skipping (SAVE_RESPONSES=False)');
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ status: 'skipped',
+ url,
+ snapshot_id: snapshotId,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(0);
+ }
+
const startTs = new Date();
- let status = 'failed';
- let output = null;
- let error = '';
- let savedCount = 0;
try {
- // Check if enabled
- if (!getEnvBool('SAVE_RESPONSES', true)) {
- console.log('Skipping responses (SAVE_RESPONSES=False)');
- status = 'skipped';
- const endTs = new Date();
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`STATUS=${status}`);
- console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
- process.exit(0);
- }
+ // Set up listener
+ await setupListener();
- const result = await archiveResponses(url);
+ // Write PID file so chrome_cleanup can kill us
+ fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- savedCount = result.savedCount || 0;
- console.log(`Saved ${savedCount} network responses to ${output}/`);
- } else {
- status = 'failed';
- error = result.error;
+ // Report success immediately (we're staying alive in background)
+ const endTs = new Date();
+ const duration = (endTs - startTs) / 1000;
+
+ console.log(`START_TS=${startTs.toISOString()}`);
+ console.log(`END_TS=${endTs.toISOString()}`);
+ console.log(`DURATION=${duration.toFixed(2)}`);
+ console.log(`OUTPUT=responses/`);
+ console.log(`STATUS=succeeded`);
+
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'succeeded',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ duration: Math.round(duration * 100) / 100,
+ output: 'responses/',
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+
+ // Daemonize: detach from parent and keep running
+ // This process will be killed by chrome_cleanup
+ if (process.stdin.isTTY) {
+ process.stdin.pause();
}
+ process.stdin.unref();
+ process.stdout.end();
+ process.stderr.end();
+
+ // Keep the process alive indefinitely
+ // Will be killed by chrome_cleanup via the PID file
+ setInterval(() => {}, 1000);
+
} catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
- }
-
- const endTs = new Date();
- const duration = (endTs - startTs) / 1000;
-
- // Print results
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`DURATION=${duration.toFixed(2)}`);
- if (output) {
- console.log(`OUTPUT=${output}`);
- }
- console.log(`STATUS=${status}`);
-
- if (error) {
+ const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
+
+ const endTs = new Date();
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'failed',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ error,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(1);
}
-
- // Print JSON result
- const resultJson = {
- extractor: EXTRACTOR_NAME,
- url,
- snapshot_id: snapshotId,
- status,
- start_ts: startTs.toISOString(),
- end_ts: endTs.toISOString(),
- duration: Math.round(duration * 100) / 100,
- output,
- saved_count: savedCount,
- error: error || null,
- };
- console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
-
- process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
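
The daemonized listener above relies on a later cleanup step to terminate it via listener.pid. A minimal sketch of what that step might look like, assuming the cleanup hook (e.g. chrome_cleanup, which is not shown in this diff) runs with access to the extractor output directories; the directory layout and signal choice are assumptions:

// Sketch of a cleanup step that stops a daemonized listener via its PID file.
const fs = require('fs');
const path = require('path');

function stopListener(extractorDir) {
  const pidFile = path.join(extractorDir, 'listener.pid');
  if (!fs.existsSync(pidFile)) return;
  const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
  try {
    process.kill(pid, 'SIGTERM'); // ask the listener process to exit
  } catch (e) {
    // ESRCH: process already gone; ignore
  }
  fs.unlinkSync(pidFile);
}

stopListener('../responses');
stopListener('../ssl');
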
diff --git a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
similarity index 99%
rename from archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
rename to archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
index 81d23435..cb17a9a3 100755
--- a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
+++ b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
- * Priority: 04 (early) - Must install before Chrome session starts
- * Hook: on_Snapshot
+ * Priority: 04 (early) - Must be installed at the Crawl level, before the Chrome session starts
+ * Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
index 2ce4cd65..4d18e010 100755
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
@@ -1,18 +1,12 @@
#!/usr/bin/env node
/**
- * Extract SSL/TLS certificate details from a URL.
+ * Extract SSL/TLS certificate details from a URL (DAEMON MODE).
*
- * Connects to Chrome session and retrieves security details including:
- * - Protocol (TLS 1.2, TLS 1.3, etc.)
- * - Cipher suite
- * - Certificate issuer, validity period
- * - Security state
+ * This hook daemonizes and stays alive to capture SSL details throughout
+ * the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
- * Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<snapshot-id>
- * Output: Writes ssl/ssl.json
- *
- * Environment variables:
- * SAVE_SSL: Enable SSL extraction (default: true)
+ * Usage: on_Snapshot__23_ssl.js --url=<url> --snapshot-id=<snapshot-id>
+ * Output: Writes ssl.json + listener.pid
*/
const fs = require('fs');
@@ -23,6 +17,7 @@ const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
+const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
@@ -58,103 +53,103 @@ function getCdpUrl() {
return null;
}
-// Extract SSL details
-async function extractSsl(url) {
- // Output directory is current directory (hook already runs in output dir)
+function getPageId() {
+ const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
+ if (fs.existsSync(pageIdFile)) {
+ return fs.readFileSync(pageIdFile, 'utf8').trim();
+ }
+ return null;
+}
+
+// Set up SSL listener
+async function setupListener(url) {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs
if (!url.startsWith('https://')) {
- return { success: false, error: 'URL is not HTTPS' };
+ throw new Error('URL is not HTTPS');
}
- let browser = null;
- let sslInfo = {};
+ const cdpUrl = getCdpUrl();
+ if (!cdpUrl) {
+ throw new Error('No Chrome session found');
+ }
- try {
- // Connect to existing Chrome session
- const cdpUrl = getCdpUrl();
- if (!cdpUrl) {
- return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
- }
+ const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
- browser = await puppeteer.connect({
- browserWSEndpoint: cdpUrl,
+ // Find our page
+ const pages = await browser.pages();
+ const pageId = getPageId();
+ let page = null;
+
+ if (pageId) {
+ page = pages.find(p => {
+ const target = p.target();
+ return target && target._targetId === pageId;
});
+ }
+ if (!page) {
+ page = pages[pages.length - 1];
+ }
- // Get the page
- const pages = await browser.pages();
- const page = pages.find(p => p.url().startsWith('http')) || pages[0];
+ if (!page) {
+ throw new Error('No page found');
+ }
- if (!page) {
- return { success: false, error: 'No page found in Chrome session' };
- }
+ // Set up listener to capture SSL details when chrome_navigate loads the page
+ page.on('response', async (response) => {
+ try {
+ const request = response.request();
- // Get CDP client for low-level access
- const client = await page.target().createCDPSession();
-
- // Enable Security domain
- await client.send('Security.enable');
-
- // Get security details from the loaded page
- const securityState = await client.send('Security.getSecurityState');
-
- sslInfo = {
- url,
- securityState: securityState.securityState,
- schemeIsCryptographic: securityState.schemeIsCryptographic,
- summary: securityState.summary || '',
- };
-
- // Try to get detailed certificate info if available
- if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) {
- sslInfo.issues = securityState.securityStateIssueIds;
- }
-
- // Get response security details from navigation
- let mainResponse = null;
- page.on('response', async (response) => {
- if (response.url() === url || response.request().isNavigationRequest()) {
- mainResponse = response;
+ // Only capture the main navigation request
+ if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) {
+ return;
}
- });
- // If we have security details from response
- if (mainResponse) {
- try {
- const securityDetails = await mainResponse.securityDetails();
- if (securityDetails) {
- sslInfo.protocol = securityDetails.protocol();
- sslInfo.subjectName = securityDetails.subjectName();
- sslInfo.issuer = securityDetails.issuer();
- sslInfo.validFrom = securityDetails.validFrom();
- sslInfo.validTo = securityDetails.validTo();
- sslInfo.certificateId = securityDetails.subjectName();
+ // Only capture if it's for our target URL
+ if (!response.url().startsWith(url.split('?')[0])) {
+ return;
+ }
- const sanList = securityDetails.sanList();
- if (sanList && sanList.length > 0) {
- sslInfo.subjectAlternativeNames = sanList;
- }
+ // Get security details from the response
+ const securityDetails = response.securityDetails();
+ let sslInfo = {};
+
+ if (securityDetails) {
+ sslInfo.protocol = securityDetails.protocol();
+ sslInfo.subjectName = securityDetails.subjectName();
+ sslInfo.issuer = securityDetails.issuer();
+ sslInfo.validFrom = securityDetails.validFrom();
+ sslInfo.validTo = securityDetails.validTo();
+ sslInfo.certificateId = securityDetails.subjectName();
+ sslInfo.securityState = 'secure';
+ sslInfo.schemeIsCryptographic = true;
+
+ const sanList = securityDetails.sanList();
+ if (sanList && sanList.length > 0) {
+ sslInfo.subjectAlternativeNames = sanList;
}
- } catch (e) {
- // Security details not available
+ } else if (response.url().startsWith('https://')) {
+ // HTTPS URL but no security details means something went wrong
+ sslInfo.securityState = 'unknown';
+ sslInfo.schemeIsCryptographic = true;
+ sslInfo.error = 'No security details available';
+ } else {
+ // Non-HTTPS URL
+ sslInfo.securityState = 'insecure';
+ sslInfo.schemeIsCryptographic = false;
}
+
+ // Write output directly to file
+ fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
+
+ } catch (e) {
+ // Ignore errors
}
+ });
- await client.detach();
-
- // Write output
- fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
-
- return { success: true, output: outputPath, sslInfo };
-
- } catch (e) {
- return { success: false, error: `${e.name}: ${e.message}` };
- } finally {
- if (browser) {
- browser.disconnect();
- }
- }
+ // Don't disconnect - keep browser connection alive
+ return { browser, page };
}
async function main() {
@@ -163,75 +158,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
- console.error('Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<snapshot-id>');
+ console.error('Usage: on_Snapshot__23_ssl.js --url=<url> --snapshot-id=<snapshot-id>');
process.exit(1);
}
+ if (!getEnvBool('SAVE_SSL', true)) {
+ console.log('Skipping (SAVE_SSL=False)');
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ status: 'skipped',
+ url,
+ snapshot_id: snapshotId,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(0);
+ }
+
const startTs = new Date();
- let status = 'failed';
- let output = null;
- let error = '';
try {
- // Check if enabled
- if (!getEnvBool('SAVE_SSL', true)) {
- console.log('Skipping SSL (SAVE_SSL=False)');
- status = 'skipped';
- const endTs = new Date();
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`STATUS=${status}`);
- console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
- process.exit(0);
- }
+ // Set up listener
+ await setupListener(url);
- const result = await extractSsl(url);
+ // Write PID file so chrome_cleanup can kill us
+ fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
- if (result.success) {
- status = 'succeeded';
- output = result.output;
- const protocol = result.sslInfo?.protocol || 'unknown';
- console.log(`SSL details extracted: ${protocol}`);
- } else {
- status = 'failed';
- error = result.error;
+ // Report success immediately (we're staying alive in background)
+ const endTs = new Date();
+ const duration = (endTs - startTs) / 1000;
+
+ console.log(`START_TS=${startTs.toISOString()}`);
+ console.log(`END_TS=${endTs.toISOString()}`);
+ console.log(`DURATION=${duration.toFixed(2)}`);
+ console.log(`OUTPUT=${OUTPUT_FILE}`);
+ console.log(`STATUS=succeeded`);
+
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'succeeded',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ duration: Math.round(duration * 100) / 100,
+ output: OUTPUT_FILE,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+
+ // Daemonize: detach from parent and keep running
+ // This process will be killed by chrome_cleanup
+ if (process.stdin.isTTY) {
+ process.stdin.pause();
}
+ process.stdin.unref();
+ process.stdout.end();
+ process.stderr.end();
+
+ // Keep the process alive indefinitely
+ // Will be killed by chrome_cleanup via the PID file
+ setInterval(() => {}, 1000);
+
} catch (e) {
- error = `${e.name}: ${e.message}`;
- status = 'failed';
- }
-
- const endTs = new Date();
- const duration = (endTs - startTs) / 1000;
-
- // Print results
- console.log(`START_TS=${startTs.toISOString()}`);
- console.log(`END_TS=${endTs.toISOString()}`);
- console.log(`DURATION=${duration.toFixed(2)}`);
- if (output) {
- console.log(`OUTPUT=${output}`);
- }
- console.log(`STATUS=${status}`);
-
- if (error) {
+ const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
+
+ const endTs = new Date();
+ const result = {
+ extractor: EXTRACTOR_NAME,
+ url,
+ snapshot_id: snapshotId,
+ status: 'failed',
+ start_ts: startTs.toISOString(),
+ end_ts: endTs.toISOString(),
+ error,
+ };
+ console.log(`RESULT_JSON=${JSON.stringify(result)}`);
+ process.exit(1);
}
-
- // Print JSON result
- const resultJson = {
- extractor: EXTRACTOR_NAME,
- url,
- snapshot_id: snapshotId,
- status,
- start_ts: startTs.toISOString(),
- end_ts: endTs.toISOString(),
- duration: Math.round(duration * 100) / 100,
- output,
- error: error || null,
- };
- console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
-
- process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
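
The sslInfo object written to ssl.json carries puppeteer's SecurityDetails fields captured from the main navigation response. An illustrative example of the resulting file for an HTTPS page (all values are invented; field names follow the sslInfo object assembled in the listener above):

{
  "protocol": "TLS 1.3",
  "subjectName": "example.com",
  "issuer": "R3",
  "validFrom": 1735689600,
  "validTo": 1743465600,
  "certificateId": "example.com",
  "securityState": "secure",
  "schemeIsCryptographic": true,
  "subjectAlternativeNames": ["example.com", "www.example.com"]
}
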
diff --git a/archivebox/plugins/ublock/on_Snapshot__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__03_ublock.js
similarity index 97%
rename from archivebox/plugins/ublock/on_Snapshot__03_ublock.js
rename to archivebox/plugins/ublock/on_Crawl__03_ublock.js
index 190a24e6..cf0f8240 100755
--- a/archivebox/plugins/ublock/on_Snapshot__03_ublock.js
+++ b/archivebox/plugins/ublock/on_Crawl__03_ublock.js
@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
*
- * Priority: 03 (early) - Must install before Chrome session starts
- * Hook: on_Snapshot
+ * Priority: 03 (early) - Must be installed at the Crawl level, before the Chrome session starts
+ * Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Blocks ads, trackers, and malware domains
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index 8d3f1e90..bbcb0a3b 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -751,13 +751,15 @@
}
.messagelist li.warning {
- background: #fffbeb;
+ background: #fffbeb !important;
+ background-image: none !important;
border: 1px solid #fde68a;
color: #92400e;
}
.messagelist li.error {
- background: #fef2f2;
+ background: #fef2f2 !important;
+ background-image: none !important;
border: 1px solid #fecaca;
color: #991b1b;
}
@@ -916,11 +918,13 @@
gap: 12px;
}
- #toolbar form {
+ #toolbar form,
+ #changelist-search {
display: flex;
align-items: center;
gap: 8px;
- flex: 1;
+ flex: 0 1 auto;
+ max-width: 500px;
}
#searchbar {
diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html
index 3b5299af..10286104 100644
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -162,10 +162,15 @@
padding: 10px 14px;
background: rgba(0,0,0,0.2);
cursor: pointer;
+ text-decoration: none;
+ color: inherit;
}
#progress-monitor .crawl-header:hover {
background: rgba(88, 166, 255, 0.1);
}
+ #progress-monitor a.crawl-header:visited {
+ color: inherit;
+ }
#progress-monitor .crawl-icon {
font-size: 16px;
width: 20px;
@@ -252,10 +257,15 @@
gap: 10px;
padding: 8px 12px;
cursor: pointer;
+ text-decoration: none;
+ color: inherit;
}
#progress-monitor .snapshot-header:hover {
background: rgba(88, 166, 255, 0.05);
}
+ #progress-monitor a.snapshot-header:visited {
+ color: inherit;
+ }
#progress-monitor .snapshot-icon {
font-size: 14px;
width: 18px;
@@ -391,15 +401,6 @@
color: #f85149;
}
- /* Expand/Collapse Icons */
- #progress-monitor .expand-icon {
- color: #8b949e;
- font-size: 10px;
- transition: transform 0.2s;
- }
- #progress-monitor .expand-icon.expanded {
- transform: rotate(90deg);
- }
@@ -449,8 +450,6 @@
let pollInterval = null;
let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
- let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
- let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
@@ -496,9 +495,8 @@
}
function renderSnapshot(snapshot, crawlId) {
- const snapshotKey = `${crawlId}-${snapshot.id}`;
- const isExpanded = expandedSnapshots.has(snapshotKey);
const statusIcon = snapshot.status === 'started' ? '↻' : '📄';
+ const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
let extractorHtml = '';
if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
@@ -507,16 +505,15 @@
a.extractor.localeCompare(b.extractor)
);
extractorHtml = `
-