diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index 8037f42d..dafa428f 100644 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -122,12 +122,10 @@ class ModelWithOutputDir(ModelWithSerializers): class Meta: abstract = True - def save(self, *args, write_indexes=False, **kwargs): + def save(self, *args, **kwargs): super().save(*args, **kwargs) self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True) self.save_json_index() - if write_indexes: - self.write_indexes() @property def output_dir_parent(self) -> str: @@ -145,17 +143,5 @@ class ModelWithOutputDir(ModelWithSerializers): def OUTPUT_DIR(self) -> Path: return DATA_DIR / self.output_dir_str - def write_indexes(self): - self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - self.save_merkle_index() - self.save_html_index() - - def save_merkle_index(self): - with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f: - json.dump(get_dir_info(self.OUTPUT_DIR, max_depth=6), f) - - def save_html_index(self): - (self.OUTPUT_DIR / 'index.html').write_text(str(self)) - def save_json_index(self): (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json())) diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 146e047c..fb0b1148 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -72,30 +72,42 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), get_worker, start_server_workers, tail_multiple_worker_logs, + is_port_in_use, ) + from workers.orchestrator import Orchestrator + import sys + + # Check if port is already in use + if is_port_in_use(host, int(port)): + print(f'[red][X] Error: Port {port} is already in use[/red]') + print(f' Another process (possibly daphne) is already listening on {host}:{port}') + print(f' Stop the conflicting process or choose a different port') + sys.exit(1) + + # Check if orchestrator is already running for this data directory + if Orchestrator.is_running(): + print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]') + print(f' Stop the existing orchestrator before starting a new server') + print(f' To stop: pkill -f "archivebox manage orchestrator"') + sys.exit(1) # Check if supervisord is already running supervisor = get_existing_supervisord_process() if supervisor: daphne_proc = get_worker(supervisor, 'worker_daphne') - # If daphne is already running, just tail logs + # If daphne is already running, error out if daphne_proc and daphne_proc.get('statename') == 'RUNNING': orchestrator_proc = get_worker(supervisor, 'worker_orchestrator') - print('[yellow][!] 
ArchiveBox server is already running[/yellow]') + print('[red][X] Error: ArchiveBox server is already running[/red]') print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING': print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING') print() - print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]') - print() - - # Tail logs for both workers - tail_multiple_worker_logs( - log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'], - follow=True, - ) - return + print('[yellow]To stop the existing server, run:[/yellow]') + print(' pkill -f "archivebox server"') + print(' pkill -f supervisord') + sys.exit(1) # Otherwise, daphne is not running - fall through to start it # No existing workers found - start new ones diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 0f1c33b6..66b8de4d 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -91,31 +91,43 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]: def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]: """Discover plugins from filesystem directories.""" + import json from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR - + plugins = {} - + for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]: if not base_dir.exists(): continue - + for plugin_dir in base_dir.iterdir(): if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'): plugin_id = f'{source}.{plugin_dir.name}' - + # Find hook scripts hooks = [] for ext in ('sh', 'py', 'js'): hooks.extend(plugin_dir.glob(f'on_*__*.{ext}')) - + + # Load config.json if it exists + config_file = plugin_dir / 'config.json' + config_data = None + if config_file.exists(): + try: + with open(config_file, 'r') as f: + config_data = json.load(f) + except (json.JSONDecodeError, IOError): + config_data = None + plugins[plugin_id] = { 'id': plugin_id, 'name': plugin_dir.name, 'path': str(plugin_dir), 'source': source, 'hooks': [str(h.name) for h in hooks], + 'config': config_data, } - + return plugins @@ -242,6 +254,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: "Source": [], "Path": [], "Hooks": [], + "Config": [], } plugins = get_filesystem_plugins() @@ -252,12 +265,21 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: rows['Path'].append(format_html('{}', plugin['path'])) rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)') + # Show config status + if plugin.get('config'): + config_properties = plugin['config'].get('properties', {}) + config_count = len(config_properties) + rows['Config'].append(f'✅ {config_count} properties' if config_count > 0 else '✅ present') + else: + rows['Config'].append('❌ none') + if not plugins: # Show a helpful message when no plugins found rows['Name'].append('(no plugins found)') rows['Source'].append('-') rows['Path'].append(mark_safe('archivebox/plugins/ or data/plugins/')) rows['Hooks'].append('-') + rows['Config'].append('-') return TableContext( title="Installed plugins", @@ -266,11 +288,12 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + import json assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' 
plugins = get_filesystem_plugins() - + plugin = plugins.get(key) if not plugin: return ItemContext( @@ -279,6 +302,33 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: data=[], ) + # Base fields that all plugins have + fields = { + "id": plugin['id'], + "name": plugin['name'], + "source": plugin['source'], + "path": plugin['path'], + "hooks": plugin['hooks'], + } + + # Add config.json data if available + if plugin.get('config'): + config_json = json.dumps(plugin['config'], indent=2) + fields["config.json"] = mark_safe(f'
<pre>{config_json}</pre>
') + + # Also extract and display individual config properties for easier viewing + if 'properties' in plugin['config']: + config_properties = plugin['config']['properties'] + properties_summary = [] + for prop_name, prop_info in config_properties.items(): + prop_type = prop_info.get('type', 'unknown') + prop_default = prop_info.get('default', 'N/A') + prop_desc = prop_info.get('description', '') + properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}") + + if properties_summary: + fields["Config Properties"] = mark_safe('
'.join(properties_summary)) + return ItemContext( slug=key, title=plugin['name'], @@ -286,13 +336,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: { "name": plugin['name'], "description": plugin['path'], - "fields": { - "id": plugin['id'], - "name": plugin['name'], - "source": plugin['source'], - "path": plugin['path'], - "hooks": plugin['hooks'], - }, + "fields": fields, "help_texts": {}, }, ], diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 59864571..082b11f8 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -22,7 +22,7 @@ from core.models import ArchiveResult, Snapshot def render_archiveresults_list(archiveresults_qs, limit=50): """Render a nice inline list view of archive results with status, extractor, output, and actions.""" - results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit]) + results = list(archiveresults_qs.order_by('extractor').select_related('snapshot')[:limit]) if not results: return mark_safe('
No Archive Results yet...
') @@ -239,7 +239,7 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface') search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') autocomplete_fields = ['snapshot'] @@ -249,7 +249,7 @@ class ArchiveResultAdmin(BaseModelAdmin): 'classes': ('card', 'wide'), }), ('Extractor', { - 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'), + 'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at', 'iface'), 'classes': ('card',), }), ('Timing', { diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index 67e074ac..6b3fe678 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -12,7 +12,7 @@ class ArchiveBoxAdmin(admin.AdminSite): archivebox_admin = ArchiveBoxAdmin() -archivebox_admin.disable_action('delete_selected') +# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin # TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index bd73c363..4f0217a3 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -23,7 +23,7 @@ from archivebox.search.admin import SearchResultsAdminMixin from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from archivebox.workers.tasks import bg_archive_snapshots, bg_add -from core.models import Tag +from core.models import Tag, Snapshot from core.admin_tags import TagInline from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list @@ -262,6 +262,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): for tag in obj.tags.all() if str(tag.name).strip() ) + # Show title if available, otherwise show URL + display_text = obj.title or obj.url + css_class = 'fetched' if obj.title else 'pending' + return format_html( '' '' @@ -272,8 +276,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): obj.archive_path, obj.archive_path, obj.archive_path, - 'fetched' if obj.title else 'pending', - urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...' 
+ css_class, + urldecode(htmldecode(display_text))[:128] ) + mark_safe(f' {tags}') @admin.display( @@ -402,12 +406,21 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="☠️ Delete" ) def delete_snapshots(self, request, queryset): - from archivebox.cli.archivebox_remove import remove - remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) - + """Delete snapshots in a single transaction to avoid SQLite concurrency issues.""" + from django.db import transaction + + total = queryset.count() + + # Get list of IDs to delete first (outside transaction) + ids_to_delete = list(queryset.values_list('pk', flat=True)) + + # Delete everything in a single atomic transaction + with transaction.atomic(): + deleted_count, _ = Snapshot.objects.filter(pk__in=ids_to_delete).delete() + messages.success( request, - mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), + mark_safe(f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 57369460..d4a32639 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -295,7 +295,7 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -362,9 +362,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea }, ) + @property def output_dir_parent(self) -> str: return 'archive' + @property def output_dir_name(self) -> str: return str(self.timestamp) @@ -808,7 +810,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # UUID field is added separately by migration for new records id = models.AutoField(primary_key=True, editable=False) uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True, unique=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -844,7 +846,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def save(self, *args, **kwargs): is_new = self._state.adding - super().save(*args, **kwargs) + # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories + # Call the Django Model.save() directly instead + models.Model.save(self, *args, **kwargs) + if is_new: 
from archivebox.misc.logging_util import log_worker_event log_worker_event( @@ -916,9 +921,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def output_dir_parent(self) -> str: return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR)) - def write_indexes(self): - super().write_indexes() - def save_search_index(self): pass @@ -966,6 +968,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi ) end_ts = timezone.now() + # Clean up empty output directory if no files were created + output_files = result.get('output_files', []) + if not output_files and extractor_dir.exists(): + try: + # Only remove if directory is completely empty + if not any(extractor_dir.iterdir()): + extractor_dir.rmdir() + except (OSError, RuntimeError): + pass # Directory not empty or can't be removed, that's fine + # Determine status from return code and JSON output output_json = result.get('output_json') or {} json_status = output_json.get('status') @@ -990,15 +1002,46 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.start_ts = start_ts self.end_ts = end_ts self.retry_at = None + self.pwd = str(extractor_dir) + + # Save cmd and cmd_version from extractor output + if output_json.get('cmd_version'): + self.cmd_version = output_json['cmd_version'][:128] # Max length from model + if output_json.get('cmd'): + self.cmd = output_json['cmd'] + self.save() # Queue any discovered URLs for crawling (parser extractors write urls.jsonl) self._queue_urls_for_crawl(extractor_dir) + # Update snapshot title if this is the title extractor + # Check both old numeric name and new plugin name for compatibility + extractor_name = get_extractor_name(self.extractor) + if self.status == self.StatusChoices.SUCCEEDED and extractor_name == 'title': + self._update_snapshot_title(extractor_dir) + # Trigger search indexing if succeeded if self.status == self.StatusChoices.SUCCEEDED: self.trigger_search_indexing() + def _update_snapshot_title(self, extractor_dir: Path): + """ + Update snapshot title from title extractor output. + + The title extractor writes title.txt with the extracted page title. + This updates the Snapshot.title field if the file exists and has content. + """ + title_file = extractor_dir / 'title.txt' + if title_file.exists(): + try: + title = title_file.read_text(encoding='utf-8').strip() + if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)): + self.snapshot.title = title[:512] # Max length from model + self.snapshot.save(update_fields=['title', 'modified_at']) + except Exception: + pass # Failed to read title, that's okay + def _queue_urls_for_crawl(self, extractor_dir: Path): """ Read urls.jsonl and queue discovered URLs for crawling. 
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index eccefbbd..610f6fe0 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True): # unlock the snapshot after we're done + set status = started self.snapshot.update_for_workers( - retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again + retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s status=Snapshot.StatusChoices.STARTED, ) @@ -209,12 +209,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True): @started.enter def enter_started(self): + from machine.models import NetworkInterface + # Suppressed: state transition logs # Lock the object and mark start time self.archiveresult.update_for_workers( retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor status=ArchiveResult.StatusChoices.STARTED, start_ts=timezone.now(), + iface=NetworkInterface.current(), ) # Run the extractor - this updates status, output, timestamps, etc. @@ -234,7 +237,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True): end_ts=None, # retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1, ) - self.archiveresult.save(write_indexes=True) + self.archiveresult.save() @succeeded.enter def enter_succeeded(self): @@ -245,7 +248,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True): end_ts=timezone.now(), # **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine} ) - self.archiveresult.save(write_indexes=True) + self.archiveresult.save() # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 4c6932df..b8139506 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -560,16 +560,28 @@ def live_progress_view(request): archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count() # Build hierarchical active crawls with nested snapshots and archive results - active_crawls = [] - for crawl in Crawl.objects.filter( + from django.db.models import Prefetch + + active_crawls_qs = Crawl.objects.filter( status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED] - ).order_by('-modified_at')[:10]: - # Get snapshots for this crawl - crawl_snapshots = Snapshot.objects.filter(crawl=crawl) - total_snapshots = crawl_snapshots.count() - completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count() - started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count() - pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count() + ).prefetch_related( + 'snapshot_set', + 'snapshot_set__archiveresult_set', + ).distinct().order_by('-modified_at')[:10] + + active_crawls = [] + for crawl in active_crawls_qs: + # Get active snapshots for this crawl - filter in Python since we prefetched all + crawl_snapshots = [ + s for s in crawl.snapshot_set.all() + if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + ][:5] # Limit to 5 most recent + + # Count snapshots by status (in memory, not DB) + total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB + completed_snapshots 
= sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED) + started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED) + pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED) # Count URLs in the crawl (for when snapshots haven't been created yet) urls_count = 0 @@ -579,39 +591,39 @@ def live_progress_view(request): # Calculate crawl progress crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 - # Get active snapshots for this crawl + # Get active snapshots for this crawl (already prefetched) active_snapshots_for_crawl = [] - for snapshot in crawl_snapshots.filter( - status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] - ).order_by('-modified_at')[:5]: - # Get archive results for this snapshot - snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot) - total_extractors = snapshot_results.count() - completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count() - failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count() - pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count() + for snapshot in crawl_snapshots: + # Get archive results for this snapshot (already prefetched) + snapshot_results = snapshot.archiveresult_set.all() + + # Count in memory instead of DB queries + total_extractors = len(snapshot_results) + completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED) + failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED) + pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED) # Calculate snapshot progress snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0 - # Get all extractors for this snapshot + # Get all extractors for this snapshot (already prefetched, sort in Python) # Order: started first, then queued, then completed + def extractor_sort_key(ar): + status_order = { + ArchiveResult.StatusChoices.STARTED: 0, + ArchiveResult.StatusChoices.QUEUED: 1, + ArchiveResult.StatusChoices.SUCCEEDED: 2, + ArchiveResult.StatusChoices.FAILED: 3, + } + return (status_order.get(ar.status, 4), ar.extractor) + all_extractors = [ { 'id': str(ar.id), 'extractor': ar.extractor, 'status': ar.status, } - for ar in snapshot_results.annotate( - status_order=Case( - When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)), - When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)), - When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)), - When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)), - default=Value(4), - output_field=IntegerField(), - ) - ).order_by('status_order', 'extractor') + for ar in sorted(snapshot_results, key=extractor_sort_key) ] active_snapshots_for_crawl.append({ @@ -726,15 +738,39 @@ def find_config_default(key: str) -> str: return default_val def find_config_type(key: str) -> str: + from typing import get_type_hints, ClassVar CONFIGS = get_all_configs() - + for config in CONFIGS.values(): if hasattr(config, key): - type_hints = get_type_hints(config) + # Try to get from pydantic model_fields first (more reliable) + if hasattr(config, 'model_fields') and key in config.model_fields: + field = config.model_fields[key] + if hasattr(field, 
'annotation'): + try: + return str(field.annotation.__name__) + except AttributeError: + return str(field.annotation) + + # Fallback to get_type_hints with proper namespace try: - return str(type_hints[key].__name__) - except AttributeError: - return str(type_hints[key]) + import typing + namespace = { + 'ClassVar': ClassVar, + 'Optional': typing.Optional, + 'Union': typing.Union, + 'List': typing.List, + 'Dict': typing.Dict, + 'Path': Path, + } + type_hints = get_type_hints(config, globalns=namespace, localns=namespace) + try: + return str(type_hints[key].__name__) + except AttributeError: + return str(type_hints[key]) + except Exception: + # If all else fails, return str + pass return 'str' def key_is_safe(key: str) -> bool: @@ -743,17 +779,55 @@ def key_is_safe(key: str) -> bool: return False return True +def find_config_source(key: str, merged_config: dict) -> str: + """Determine where a config value comes from.""" + import os + from machine.models import Machine + + # Check if it's from machine config + try: + machine = Machine.current() + if machine.config and key in machine.config: + return 'Machine' + except Exception: + pass + + # Check if it's from environment variable + if key in os.environ: + return 'Environment' + + # Check if it's from config file + from archivebox.config.configset import BaseConfigSet + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + return 'Config File' + + # Otherwise it's using the default + return 'Default' + + @render_with_table_view def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: CONFIGS = get_all_configs() - + assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' + # Get merged config that includes Machine.config overrides + try: + from machine.models import Machine + machine = Machine.current() + merged_config = get_config() + except Exception as e: + # Fallback if Machine model not available + merged_config = get_config() + machine = None + rows = { "Section": [], "Key": [], "Type": [], "Value": [], + "Source": [], "Default": [], # "Documentation": [], # "Aliases": [], @@ -764,7 +838,21 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') rows['Key'].append(ItemLink(key, key=key)) rows['Type'].append(format_html('{}', find_config_type(key))) - rows['Value'].append(mark_safe(f'{getattr(section, key)}') if key_is_safe(key) else '******** (redacted)') + + # Use merged config value (includes machine overrides) + actual_value = merged_config.get(key, getattr(section, key, None)) + rows['Value'].append(mark_safe(f'{actual_value}') if key_is_safe(key) else '******** (redacted)') + + # Show where the value comes from + source = find_config_source(key, merged_config) + source_colors = { + 'Machine': 'purple', + 'Environment': 'blue', + 'Config File': 'green', + 'Default': 'gray' + } + rows['Source'].append(format_html('{}', source_colors.get(source, 'gray'), source)) + rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) # rows['Aliases'].append(', '.join(find_config_aliases(key))) @@ -775,6 +863,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: rows['Key'].append(ItemLink(key, key=key)) rows['Type'].append(format_html('{}', getattr(type(CONSTANTS_CONFIG[key]), '__name__', str(CONSTANTS_CONFIG[key])))) 
rows['Value'].append(format_html('{}', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)') + rows['Source'].append(mark_safe('Constant')) rows['Default'].append(mark_safe(f'{find_config_default(key) or "See here..."}')) # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) # rows['Aliases'].append('') @@ -787,11 +876,58 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + import os + from machine.models import Machine + from archivebox.config.configset import BaseConfigSet + CONFIGS = get_all_configs() FLAT_CONFIG = get_flat_config() - + assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' + # Get merged config + merged_config = get_config() + + # Determine all sources for this config value + sources_info = [] + + # Default value + default_val = find_config_default(key) + if default_val: + sources_info.append(('Default', default_val, 'gray')) + + # Config file value + if CONSTANTS.CONFIG_FILE.exists(): + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + sources_info.append(('Config File', file_config[key], 'green')) + + # Environment variable + if key in os.environ: + sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue')) + + # Machine config + machine = None + machine_admin_url = None + try: + machine = Machine.current() + machine_admin_url = f'/admin/machine/machine/{machine.id}/change/' + if machine.config and key in machine.config: + sources_info.append(('Machine', machine.config[key] if key_is_safe(key) else '********', 'purple')) + except Exception: + pass + + # Final computed value + final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))) + if not key_is_safe(key): + final_value = '********' + + # Build sources display + sources_html = '
'.join([ + f'{source}: {value}' + for source, value, color in sources_info + ]) + # aliases = USER_CONFIG.get(key, {}).get("aliases", []) aliases = [] @@ -813,7 +949,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont "fields": { 'Key': key, 'Type': find_config_type(key), - 'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********', + 'Value': final_value, + 'Source': find_config_source(key, merged_config), }, "help_texts": { 'Key': mark_safe(f''' @@ -830,10 +967,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont 'Value': mark_safe(f''' {'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

                    ' if not key_is_safe(key) else ''}
 
-                    Default:
-                    
-                    {find_config_default(key) or '↗️ See in ArchiveBox source code...'}
-                    
+                    Configuration Sources (in priority order):
+                    {sources_html}
 
To change this value, edit data/ArchiveBox.conf or run: @@ -845,6 +980,20 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont }"

'''), + 'Source': mark_safe(f''' + The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source. +

+ Priority order (highest to lowest): +
    +
  1. Machine - Machine-specific overrides (e.g., resolved binary paths) + {f'
    → Edit {key} in Machine.config for this server' if machine_admin_url else ''} +
  2. +
  3. Environment - Environment variables
  4. +
  5. Config File - data/ArchiveBox.conf
  6. +
  7. Default - Default value from code
  8. +
+ {f'
💡 Tip: To override {key} on this machine, edit the Machine.config field and add:
{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ''} + '''), }, }, ], diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index fa41f851..f5e1c9ae 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.crawls' import json from pathlib import Path +from django import forms from django.utils.html import format_html, format_html_join, mark_safe from django.contrib import admin, messages from django.urls import path @@ -136,16 +137,32 @@ def render_snapshots_list(snapshots_qs, limit=20): ''') +class CrawlAdminForm(forms.ModelForm): + """Custom form for Crawl admin to render urls field as textarea.""" + + class Meta: + model = Crawl + fields = '__all__' + widgets = { + 'urls': forms.Textarea(attrs={ + 'rows': 8, + 'style': 'width: 100%; font-family: monospace; font-size: 13px;', + 'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #', + }), + } + + class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): + form = CrawlAdminForm list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots') sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at') search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls') - readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor') + readonly_fields = ('created_at', 'modified_at', 'snapshots') fieldsets = ( ('URLs', { - 'fields': ('urls_editor',), + 'fields': ('urls',), 'classes': ('card', 'wide'), }), ('Info', { @@ -177,9 +194,32 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at') ordering = ['-created_at', '-retry_at'] list_per_page = 100 - actions = ["delete_selected"] + actions = ["delete_selected_batched"] change_actions = ['recrawl'] + def get_queryset(self, request): + """Optimize queries with select_related and annotations.""" + qs = super().get_queryset(request) + return qs.select_related('schedule', 'created_by').annotate( + num_snapshots_cached=Count('snapshot_set') + ) + + @admin.action(description='Delete selected crawls') + def delete_selected_batched(self, request, queryset): + """Delete crawls in a single transaction to avoid SQLite concurrency issues.""" + from django.db import transaction + + total = queryset.count() + + # Get list of IDs to delete first (outside transaction) + ids_to_delete = list(queryset.values_list('pk', flat=True)) + + # Delete everything in a single atomic transaction + with transaction.atomic(): + deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete() + + messages.success(request, f'Successfully deleted {total} crawls ({deleted_count} total objects including related records).') + @action(label='Recrawl', description='Create a new crawl with the same settings') def recrawl(self, request, obj): """Duplicate this crawl as a new crawl with the same URLs and settings.""" @@ -214,7 +254,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return redirect('admin:crawls_crawl_change', new_crawl.id) def num_snapshots(self, obj): - return obj.snapshot_set.count() + # Use cached annotation from get_queryset to avoid N+1 + return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count()) def snapshots(self, obj): return 
render_snapshots_list(obj.snapshot_set.all())
 
@@ -269,7 +310,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
                    placeholder="https://example.com https://example2.com # Comments start with #"
                    readonly>{escaped_urls}

-                    {line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
+                    {line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only

diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 263869fe..9f11b1c4 100644 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -88,7 +88,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def __str__(self): first_url = self.get_urls_list()[0] if self.get_urls_list() else '' - return f'[{self.id}] {first_url[:64]}' + # Show last 8 digits of UUID and more of the URL + short_id = str(self.id)[-8:] + return f'[...{short_id}] {first_url[:120]}' def save(self, *args, **kwargs): is_new = self._state.adding diff --git a/archivebox/crawls/statemachines.py b/archivebox/crawls/statemachines.py index 45cb62fc..99b72699 100644 --- a/archivebox/crawls/statemachines.py +++ b/archivebox/crawls/statemachines.py @@ -83,7 +83,7 @@ class CrawlMachine(StateMachine, strict_states=True): # Suppressed: state transition logs # lock the crawl object while we create snapshots self.crawl.update_for_workers( - retry_at=timezone.now() + timedelta(seconds=5), + retry_at=timezone.now(), # Process immediately status=Crawl.StatusChoices.QUEUED, ) @@ -96,7 +96,7 @@ class CrawlMachine(StateMachine, strict_states=True): # only update status to STARTED once snapshots are created self.crawl.update_for_workers( - retry_at=timezone.now() + timedelta(seconds=5), + retry_at=timezone.now(), # Process immediately status=Crawl.StatusChoices.STARTED, ) except Exception as e: @@ -129,7 +129,7 @@ class CrawlMachine(StateMachine, strict_states=True): timeout=60, config_objects=[self.crawl], crawl_id=str(self.crawl.id), - seed_uri=first_url, + source_url=first_url, ) # Process hook results - parse JSONL output and create DB objects @@ -195,8 +195,43 @@ class CrawlMachine(StateMachine, strict_states=True): @sealed.enter def enter_sealed(self): + # Run on_CrawlEnd hooks to clean up resources (e.g., kill shared Chrome) + self._run_crawl_end_hooks() + # Suppressed: state transition logs self.crawl.update_for_workers( retry_at=None, status=Crawl.StatusChoices.SEALED, ) + + def _run_crawl_end_hooks(self): + """Run on_CrawlEnd hooks to clean up resources at crawl completion.""" + from pathlib import Path + from archivebox.hooks import run_hooks, discover_hooks + from archivebox.config import CONSTANTS + + # Discover and run all on_CrawlEnd hooks + hooks = discover_hooks('CrawlEnd') + if not hooks: + return + + # Use the same temporary output directory from crawl start + output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}' + + # Run all on_CrawlEnd hooks + first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else '' + results = run_hooks( + event_name='CrawlEnd', + output_dir=output_dir, + timeout=30, # Cleanup hooks should be quick + config_objects=[self.crawl], + crawl_id=str(self.crawl.id), + source_url=first_url, + ) + + # Log any failures but don't block sealing + for result in results: + if result['returncode'] != 0: + print(f'[yellow]⚠️ CrawlEnd hook failed: {result.get("hook", "unknown")}[/yellow]') + if result.get('stderr'): + print(f'[dim]{result["stderr"][:200]}[/dim]') diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 4b06324a..f8f75b18 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -199,16 +199,24 @@ def run_hook( # Build CLI arguments from kwargs for key, value in kwargs.items(): + # Skip keys that start with underscore (internal parameters) + if key.startswith('_'): + continue + arg_key = f'--{key.replace("_", "-")}' if isinstance(value, bool): if value: cmd.append(arg_key) - elif value 
is not None: + elif value is not None and value != '': # JSON-encode complex values, use str for simple ones + # Skip empty strings to avoid --key= which breaks argument parsers if isinstance(value, (dict, list)): cmd.append(f'{arg_key}={json.dumps(value)}') else: - cmd.append(f'{arg_key}={value}') + # Ensure value is converted to string and strip whitespace + str_value = str(value).strip() + if str_value: # Only add if non-empty after stripping + cmd.append(f'{arg_key}={str_value}') # Set up environment with base paths env = os.environ.copy() diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py index 1789d806..a8abf996 100644 --- a/archivebox/mcp/server.py +++ b/archivebox/mcp/server.py @@ -125,24 +125,64 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments Returns MCP-formatted result with captured output and error status. """ + # Setup Django for archive commands (commands that need database access) + from archivebox.cli import ArchiveBoxGroup + if cmd_name in ArchiveBoxGroup.archive_commands: + try: + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder + setup_django() + check_data_folder() + except Exception as e: + # If Django setup fails, return error (unless it's manage/shell which handle this themselves) + if cmd_name not in ('manage', 'shell'): + return { + "content": [{ + "type": "text", + "text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory." + }], + "isError": True + } + # Use Click's test runner to invoke command programmatically runner = CliRunner() + # Build a map of parameter names to their Click types (Argument vs Option) + param_map = {param.name: param for param in click_command.params} + # Convert arguments dict to CLI args list args = [] + positional_args = [] + for key, value in arguments.items(): param_name = key.replace('_', '-') # Click uses dashes + param = param_map.get(key) - if isinstance(value, bool): - if value: + # Check if this is a positional Argument (not an Option) + is_argument = isinstance(param, click.Argument) + + if is_argument: + # Positional arguments - add them without dashes + if isinstance(value, list): + positional_args.extend([str(v) for v in value]) + elif value is not None: + positional_args.append(str(value)) + else: + # Options - add with dashes + if isinstance(value, bool): + if value: + args.append(f'--{param_name}') + elif isinstance(value, list): + # Multiple values for an option (rare) + for item in value: + args.append(f'--{param_name}') + args.append(str(item)) + elif value is not None: args.append(f'--{param_name}') - elif isinstance(value, list): - # Multiple values (e.g., multiple URLs) - for item in value: - args.append(str(item)) - elif value is not None: - args.append(f'--{param_name}') - args.append(str(value)) + args.append(str(value)) + + # Add positional arguments at the end + args.extend(positional_args) # Execute the command try: diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 1016539e..2a725dd4 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -542,24 +542,32 @@ def log_worker_event( """ indent = ' ' * indent_level - # Build worker identifier + from rich.markup import escape + + # Build worker identifier (without URL/extractor) worker_parts = [worker_type] # Don't add pid/worker_id for DB operations (they happen in whatever process is running) if pid and worker_type != 'DB': 
worker_parts.append(f'pid={pid}') if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB': worker_parts.append(f'id={worker_id}') - if url and worker_type in ('SnapshotWorker', 'DB'): - worker_parts.append(f'url={truncate_url(url)}') - if extractor and worker_type in ('ArchiveResultWorker', 'DB'): - worker_parts.append(f'extractor={extractor}') # Format worker label - only add brackets if there are additional identifiers + # Use double brackets [[...]] to escape Rich markup if len(worker_parts) > 1: - worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]' + worker_label = f'{worker_parts[0]}[[{", ".join(worker_parts[1:])}]]' else: worker_label = worker_parts[0] + # Build URL/extractor display (shown AFTER the label, outside brackets) + url_extractor_parts = [] + if url: + url_extractor_parts.append(f'url: {escape(url)}') + if extractor: + url_extractor_parts.append(f'extractor: {escape(extractor)}') + + url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else '' + # Build metadata string metadata_str = '' if metadata: @@ -592,8 +600,6 @@ def log_worker_event( color = 'green' elif event.startswith('Created'): color = 'cyan' # DB creation events - elif event in ('Processing...', 'PROCESSING'): - color = 'blue' elif event in ('Completed', 'COMPLETED', 'All work complete'): color = 'blue' elif event in ('Failed', 'ERROR', 'Failed to spawn worker'): @@ -610,6 +616,12 @@ def log_worker_event( text = Text() text.append(indent) text.append(f'{worker_label} {event}{error_str}', style=color) + + # Add URL/extractor info first (more important) + if url_extractor_str: + text.append(f' | {url_extractor_str}') + + # Then add other metadata if metadata_str: text.append(f' | {metadata_str}') diff --git a/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js similarity index 98% rename from archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js rename to archivebox/plugins/captcha2/on_Crawl__01_captcha2.js index db7dd896..3e6dbca2 100755 --- a/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js +++ b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js @@ -8,8 +8,8 @@ * Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo * Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer * - * Priority: 01 (early) - Must install before Chrome session starts - * Hook: on_Snapshot + * Priority: 01 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) * * Requirements: * - API_KEY_2CAPTCHA environment variable must be set diff --git a/archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js similarity index 91% rename from archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js rename to archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js index f97c9ef1..d370c81f 100755 --- a/archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js +++ b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js @@ -2,11 +2,11 @@ /** * 2Captcha Extension Configuration * - * Configures the 2captcha extension with API key after Chrome session starts. - * Runs once per browser session to inject API key into extension storage. + * Configures the 2captcha extension with API key after Crawl-level Chrome session starts. + * Runs once per crawl to inject API key into extension storage. 
* - * Priority: 21 (after chrome_session at 20, before navigation at 30) - * Hook: on_Snapshot + * Priority: 11 (after chrome_session at 10) + * Hook: on_Crawl (runs once per crawl, not per snapshot) * * Requirements: * - API_KEY_2CAPTCHA environment variable must be set @@ -17,8 +17,19 @@ const path = require('path'); const fs = require('fs'); const puppeteer = require('puppeteer-core'); -const OUTPUT_DIR = 'chrome_session'; -const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured'); +// Get crawl ID from args to find the crawl-level chrome session +function getCrawlChromeSessionDir() { + const args = parseArgs(); + const crawlId = args.crawl_id; + if (!crawlId) { + return null; + } + const dataDir = process.env.DATA_DIR || '.'; + return path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session'); +} + +const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome_session'; +const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured'); // Get environment variable with default function getEnv(name, defaultValue = '') { @@ -53,7 +64,7 @@ async function configure2Captcha() { } // Load extensions metadata - const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json'); + const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); if (!fs.existsSync(extensionsFile)) { return { success: false, error: 'extensions.json not found - chrome_session must run first' }; } @@ -70,7 +81,7 @@ async function configure2Captcha() { try { // Connect to the existing Chrome session via CDP - const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt'); + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); if (!fs.existsSync(cdpFile)) { return { success: false, error: 'CDP URL not found - chrome_session must run first' }; } diff --git a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py index fae91ffb..6c7133e4 100644 --- a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py +++ b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py @@ -3,10 +3,11 @@ Clean up Chrome browser session started by chrome_session extractor. This extractor runs after all Chrome-based extractors (screenshot, pdf, dom) -to terminate the Chrome process and clean up any leftover files. +to clean up the Chrome session. For shared sessions (crawl-level Chrome), it +closes only this snapshot's tab. For standalone sessions, it kills Chrome. -Usage: on_Snapshot__24_chrome_cleanup.py --url= --snapshot-id= -Output: Terminates Chrome process and removes lock files +Usage: on_Snapshot__45_chrome_cleanup.py --url= --snapshot-id= +Output: Closes tab or terminates Chrome process Environment variables: CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup) @@ -18,6 +19,7 @@ import os import signal import sys import time +import urllib.request from datetime import datetime, timezone from pathlib import Path @@ -33,18 +35,126 @@ def get_env(name: str, default: str = '') -> str: return os.environ.get(name, default).strip() +def close_tab_via_cdp(cdp_url: str, page_id: str) -> bool: + """ + Close a specific tab via Chrome DevTools Protocol. + + Returns True if tab was closed successfully. + """ + try: + # Extract port from WebSocket URL (ws://127.0.0.1:PORT/...) 
+ import re + match = re.search(r':(\d+)/', cdp_url) + if not match: + return False + port = match.group(1) + + # Use CDP HTTP endpoint to close the target + close_url = f'http://127.0.0.1:{port}/json/close/{page_id}' + req = urllib.request.Request(close_url, method='GET') + + with urllib.request.urlopen(req, timeout=5) as resp: + return resp.status == 200 + + except Exception as e: + print(f'Failed to close tab via CDP: {e}', file=sys.stderr) + return False + + +def kill_listener_processes() -> list[str]: + """ + Kill any daemonized listener processes (consolelog, ssl, responses, etc.). + + These hooks write listener.pid files that we need to kill. + Returns list of killed process descriptions. + """ + killed = [] + snapshot_dir = Path('.').resolve().parent # Go up from chrome_cleanup dir + + # Look for listener.pid files in sibling directories + for extractor_dir in snapshot_dir.iterdir(): + if not extractor_dir.is_dir(): + continue + + pid_file = extractor_dir / 'listener.pid' + if not pid_file.exists(): + continue + + try: + pid = int(pid_file.read_text().strip()) + try: + os.kill(pid, signal.SIGTERM) + # Brief wait for graceful shutdown + for _ in range(5): + try: + os.kill(pid, 0) + time.sleep(0.05) + except OSError: + break + else: + # Force kill if still running + try: + os.kill(pid, signal.SIGKILL) + except OSError: + pass + + killed.append(f'{extractor_dir.name} listener (PID {pid})') + except OSError as e: + if e.errno != 3: # Not "No such process" + killed.append(f'{extractor_dir.name} listener (already dead)') + except (ValueError, FileNotFoundError): + pass + + return killed + + def cleanup_chrome_session() -> tuple[bool, str | None, str]: """ Clean up Chrome session started by chrome_session extractor. + For shared sessions (crawl-level Chrome), closes only this snapshot's tab. + For standalone sessions, kills the Chrome process. + Returns: (success, output_info, error_message) """ + # First, kill any daemonized listener processes + killed = kill_listener_processes() + if killed: + print(f'Killed listener processes: {", ".join(killed)}') + session_dir = Path(CHROME_SESSION_DIR) if not session_dir.exists(): return True, 'No chrome_session directory found', '' + # Check if this is a shared session + shared_file = session_dir / 'shared_session.txt' + is_shared = False + if shared_file.exists(): + is_shared = shared_file.read_text().strip().lower() == 'true' + pid_file = session_dir / 'pid.txt' + cdp_file = session_dir / 'cdp_url.txt' + page_id_file = session_dir / 'page_id.txt' + + if is_shared: + # Shared session - only close this snapshot's tab + if cdp_file.exists() and page_id_file.exists(): + try: + cdp_url = cdp_file.read_text().strip() + page_id = page_id_file.read_text().strip() + + if close_tab_via_cdp(cdp_url, page_id): + return True, f'Closed tab {page_id[:8]}... 
(shared Chrome session)', '' + else: + return True, f'Tab may already be closed (shared Chrome session)', '' + + except Exception as e: + return True, f'Tab cleanup attempted: {e}', '' + + return True, 'Shared session - Chrome stays running', '' + + # Standalone session - kill the Chrome process killed = False if pid_file.exists(): diff --git a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js index b34c8c96..fb414ee7 100644 --- a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js @@ -2,38 +2,27 @@ /** * Navigate the Chrome browser to the target URL. * - * This extractor runs AFTER pre-load extractors (21-29) have registered their - * CDP listeners. It connects to the existing Chrome session, navigates to the URL, - * waits for page load, and captures response headers. + * This is a simple hook that ONLY navigates - nothing else. + * Pre-load hooks (21-29) should set up their own CDP listeners. + * Post-load hooks (31+) can then read from the loaded page. * * Usage: on_Snapshot__30_chrome_navigate.js --url= --snapshot-id= - * Output: Writes to chrome_session/: - * - response_headers.json: HTTP response headers from main document - * - final_url.txt: Final URL after any redirects - * - page_loaded.txt: Marker file indicating navigation is complete + * Output: Writes page_loaded.txt marker when navigation completes * * Environment variables: - * CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60) + * CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60) * CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0) * CHROME_WAIT_FOR: Wait condition (default: networkidle2) - * - domcontentloaded: DOM is ready, resources may still load - * - load: Page fully loaded including resources - * - networkidle0: No network activity for 500ms (strictest) - * - networkidle2: At most 2 network connections for 500ms - * - * # Fallbacks - * TIMEOUT: Fallback timeout */ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -// Extractor metadata const EXTRACTOR_NAME = 'chrome_navigate'; const CHROME_SESSION_DIR = '../chrome_session'; +const OUTPUT_DIR = '.'; -// Parse command line arguments function parseArgs() { const args = {}; process.argv.slice(2).forEach(arg => { @@ -45,18 +34,10 @@ function parseArgs() { return args; } -// Get environment variable with default function getEnv(name, defaultValue = '') { return (process.env[name] || defaultValue).trim(); } -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - function getEnvInt(name, defaultValue = 0) { const val = parseInt(getEnv(name, String(defaultValue)), 10); return isNaN(val) ? defaultValue : val; @@ -67,159 +48,79 @@ function getEnvFloat(name, defaultValue = 0) { return isNaN(val) ? 
defaultValue : val; } -// Read CDP URL from chrome_session function getCdpUrl() { const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (!fs.existsSync(cdpFile)) { - return null; - } + if (!fs.existsSync(cdpFile)) return null; return fs.readFileSync(cdpFile, 'utf8').trim(); } -// Read URL from chrome_session (set by chrome_session extractor) -function getTargetUrl() { - const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt'); - if (!fs.existsSync(urlFile)) { - return null; - } - return fs.readFileSync(urlFile, 'utf8').trim(); +function getPageId() { + const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt'); + if (!fs.existsSync(pageIdFile)) return null; + return fs.readFileSync(pageIdFile, 'utf8').trim(); } -// Validate wait condition function getWaitCondition() { const waitFor = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase(); - const validConditions = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2']; - if (validConditions.includes(waitFor)) { - return waitFor; - } - console.error(`Warning: Invalid CHROME_WAIT_FOR="${waitFor}", using networkidle2`); - return 'networkidle2'; + const valid = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2']; + return valid.includes(waitFor) ? waitFor : 'networkidle2'; } -// Sleep helper function sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } -async function navigateToUrl(url, cdpUrl) { +async function navigate(url, cdpUrl) { const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000; const waitUntil = getWaitCondition(); + const pageId = getPageId(); let browser = null; - let responseHeaders = {}; - let redirectChain = []; - let finalUrl = url; try { - // Connect to existing browser - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); + browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - // Get all pages and find our target page const pages = await browser.pages(); if (pages.length === 0) { return { success: false, error: 'No pages found in browser' }; } - // Use the last created page (most likely the one chrome_session created) - const page = pages[pages.length - 1]; - - // Set up response interception to capture headers and redirects - page.on('response', async (response) => { - const request = response.request(); - - // Track redirects - if (response.status() >= 300 && response.status() < 400) { - redirectChain.push({ - url: response.url(), - status: response.status(), - location: response.headers()['location'] || null, - }); - } - - // Capture headers from the main document request - if (request.isNavigationRequest() && request.frame() === page.mainFrame()) { - try { - responseHeaders = { - url: response.url(), - status: response.status(), - statusText: response.statusText(), - headers: response.headers(), - }; - finalUrl = response.url(); - } catch (e) { - // Ignore errors capturing headers - } - } - }); - - // Navigate to URL and wait for load - console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`); - - const response = await page.goto(url, { - waitUntil, - timeout, - }); - - // Capture final response if not already captured - if (response && Object.keys(responseHeaders).length === 0) { - responseHeaders = { - url: response.url(), - status: response.status(), - statusText: response.statusText(), - headers: response.headers(), - }; - finalUrl = response.url(); + // Find page by target 
ID if available + let page = null; + if (pageId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === pageId; + }); + } + if (!page) { + page = pages[pages.length - 1]; } - // Apply optional delay after load + // Navigate + console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`); + const response = await page.goto(url, { waitUntil, timeout }); + + // Optional delay if (delayAfterLoad > 0) { console.log(`Waiting ${delayAfterLoad}ms after load...`); await sleep(delayAfterLoad); } - // Write response headers - if (Object.keys(responseHeaders).length > 0) { - // Add redirect chain to headers - responseHeaders.redirect_chain = redirectChain; + const finalUrl = page.url(); + const status = response ? response.status() : null; - fs.writeFileSync( - path.join(CHROME_SESSION_DIR, 'response_headers.json'), - JSON.stringify(responseHeaders, null, 2) - ); - } + // Write marker file + fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString()); + fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl); - // Write final URL (after redirects) - fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl); - - // Write marker file indicating page is loaded - fs.writeFileSync( - path.join(CHROME_SESSION_DIR, 'page_loaded.txt'), - new Date().toISOString() - ); - - // Disconnect but leave browser running for post-load extractors browser.disconnect(); - return { - success: true, - output: CHROME_SESSION_DIR, - finalUrl, - status: responseHeaders.status, - redirectCount: redirectChain.length, - }; + return { success: true, finalUrl, status }; } catch (e) { - // Don't close browser on error - let cleanup handle it - if (browser) { - try { - browser.disconnect(); - } catch (disconnectErr) { - // Ignore - } - } + if (browser) browser.disconnect(); return { success: false, error: `${e.name}: ${e.message}` }; } } @@ -239,55 +140,33 @@ async function main() { let output = null; let error = ''; - try { - // Check for chrome_session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - console.error('ERROR: chrome_session not found (cdp_url.txt missing)'); - console.error('chrome_navigate requires chrome_session to run first'); - process.exit(1); - } + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + console.error('ERROR: chrome_session not found'); + process.exit(1); + } - // Get URL from chrome_session or use provided URL - const targetUrl = getTargetUrl() || url; + const result = await navigate(url, cdpUrl); - const result = await navigateToUrl(targetUrl, cdpUrl); - - if (result.success) { - status = 'succeeded'; - output = result.output; - console.log(`Page loaded: ${result.finalUrl}`); - console.log(`HTTP status: ${result.status}`); - if (result.redirectCount > 0) { - console.log(`Redirects: ${result.redirectCount}`); - } - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; + if (result.success) { + status = 'succeeded'; + output = OUTPUT_DIR; + console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status})`); + } else { + error = result.error; } const endTs = new Date(); const duration = (endTs - startTs) / 1000; - // Print results console.log(`START_TS=${startTs.toISOString()}`); console.log(`END_TS=${endTs.toISOString()}`); console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } + if (output) console.log(`OUTPUT=${output}`); console.log(`STATUS=${status}`); + if 
(error) console.error(`ERROR=${error}`); - if (error) { - console.error(`ERROR=${error}`); - } - - // Print JSON result - const resultJson = { + console.log(`RESULT_JSON=${JSON.stringify({ extractor: EXTRACTOR_NAME, url, snapshot_id: snapshotId, @@ -297,8 +176,7 @@ async function main() { duration: Math.round(duration * 100) / 100, output, error: error || null, - }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); + })}`); process.exit(status === 'succeeded' ? 0 : 1); } diff --git a/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py b/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py new file mode 100644 index 00000000..45c6aee7 --- /dev/null +++ b/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +Clean up Chrome browser session at the end of a crawl. + +This runs after all snapshots in a crawl have been processed to terminate +the shared Chrome session that was started by on_Crawl__10_chrome_session.js. + +Usage: on_Crawl__99_chrome_cleanup.py --crawl-id= +Output: Terminates the crawl's Chrome process +""" + +import json +import os +import signal +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +import rich_click as click + + +# Extractor metadata +EXTRACTOR_NAME = 'chrome_cleanup' +CHROME_SESSION_DIR = 'chrome_session' + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def cleanup_crawl_chrome() -> tuple[bool, str | None, str]: + """ + Clean up Chrome session for the crawl. + + Returns: (success, output_info, error_message) + """ + session_dir = Path(CHROME_SESSION_DIR) + + if not session_dir.exists(): + return True, 'No chrome_session directory found', '' + + pid_file = session_dir / 'pid.txt' + killed = False + + if pid_file.exists(): + try: + pid = int(pid_file.read_text().strip()) + + # Try graceful termination first + try: + os.kill(pid, signal.SIGTERM) + killed = True + print(f'[*] Sent SIGTERM to Chrome PID {pid}') + + # Wait briefly for graceful shutdown + for _ in range(20): + try: + os.kill(pid, 0) # Check if still running + time.sleep(0.1) + except OSError: + print(f'[+] Chrome process {pid} terminated') + break # Process is gone + else: + # Force kill if still running + print(f'[!] 
Chrome still running, sending SIGKILL') + try: + os.kill(pid, signal.SIGKILL) + except OSError: + pass + + except OSError as e: + # Process might already be dead, that's fine + if e.errno == 3: # No such process + print(f'[*] Chrome process {pid} already terminated') + else: + return False, None, f'Failed to kill Chrome PID {pid}: {e}' + + except ValueError: + return False, None, f'Invalid PID in {pid_file}' + except Exception as e: + return False, None, f'{type(e).__name__}: {e}' + + result_info = f'Crawl Chrome cleanup: PID {"killed" if killed else "not found or already terminated"}' + return True, result_info, '' + + +@click.command() +@click.option('--crawl-id', required=True, help='Crawl UUID') +@click.option('--source-url', default='', help='Source URL (unused)') +def main(crawl_id: str, source_url: str): + """Clean up shared Chrome browser session for crawl.""" + + start_ts = datetime.now(timezone.utc) + output = None + status = 'failed' + error = '' + + try: + success, output, error = cleanup_crawl_chrome() + status = 'succeeded' if success else 'failed' + + if success: + print(f'Crawl Chrome cleanup completed: {output}') + + except Exception as e: + error = f'{type(e).__name__}: {e}' + status = 'failed' + + # Print results + end_ts = datetime.now(timezone.utc) + duration = (end_ts - start_ts).total_seconds() + + print(f'START_TS={start_ts.isoformat()}') + print(f'END_TS={end_ts.isoformat()}') + print(f'DURATION={duration:.2f}') + if output: + print(f'OUTPUT={output}') + print(f'STATUS={status}') + + if error: + print(f'ERROR={error}', file=sys.stderr) + + # Print JSON result + result_json = { + 'extractor': EXTRACTOR_NAME, + 'crawl_id': crawl_id, + 'status': status, + 'start_ts': start_ts.isoformat(), + 'end_ts': end_ts.isoformat(), + 'duration': round(duration, 2), + 'output': output, + 'error': error or None, + } + print(f'RESULT_JSON={json.dumps(result_json)}') + + sys.exit(0 if status == 'succeeded' else 1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js b/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js new file mode 100644 index 00000000..b3ad9ff8 --- /dev/null +++ b/archivebox/plugins/chrome_session/on_Crawl__10_chrome_session.js @@ -0,0 +1,343 @@ +#!/usr/bin/env node +/** + * Launch a shared Chrome browser session for the entire crawl. + * + * This runs once per crawl and keeps Chrome alive for all snapshots to share. + * Each snapshot creates its own tab via on_Snapshot__20_chrome_session.js. 
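The cleanup hook above boils down to a "terminate, verify, escalate" sequence keyed off the `pid.txt` written at launch. As a minimal sketch of the same sequence in Node (the `terminateChrome`/`isAlive` helper names are illustrative and not part of this diff; only `pid.txt` and the roughly 2-second grace period come from the hook):

```js
const fs = require('fs');
const path = require('path');

const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Signal 0 delivers nothing; it only checks whether the PID still exists.
function isAlive(pid) {
  try {
    process.kill(pid, 0);
    return true;
  } catch (e) {
    return e.code === 'EPERM'; // exists, but owned by another user
  }
}

async function terminateChrome(sessionDir) {
  const pidFile = path.join(sessionDir, 'pid.txt');
  if (!fs.existsSync(pidFile)) return;                     // nothing to clean up
  const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
  if (!Number.isInteger(pid) || !isAlive(pid)) return;     // stale or invalid PID

  process.kill(pid, 'SIGTERM');                            // graceful shutdown first
  for (let i = 0; i < 20 && isAlive(pid); i++) await sleep(100);
  if (isAlive(pid)) process.kill(pid, 'SIGKILL');          // escalate after ~2s
}
```

The Python hook treats errno 3 (ESRCH) as "already terminated"; the Node equivalent of that case surfaces as `e.code === 'ESRCH'` from `process.kill`.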
+ * + * Usage: on_Crawl__10_chrome_session.js --crawl-id= --source-url= + * Output: Creates chrome_session/ with: + * - cdp_url.txt: WebSocket URL for CDP connection + * - pid.txt: Chrome process ID (for cleanup) + * + * Environment variables: + * CHROME_BINARY: Path to Chrome/Chromium binary + * CHROME_RESOLUTION: Page resolution (default: 1440,2000) + * CHROME_HEADLESS: Run in headless mode (default: true) + * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) + */ + +const fs = require('fs'); +const path = require('path'); +const { spawn } = require('child_process'); +const http = require('http'); + +// Extractor metadata +const EXTRACTOR_NAME = 'chrome_session'; +const OUTPUT_DIR = 'chrome_session'; + +// Parse command line arguments +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach(arg => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +// Get environment variable with default +function getEnv(name, defaultValue = '') { + return (process.env[name] || defaultValue).trim(); +} + +function getEnvBool(name, defaultValue = false) { + const val = getEnv(name, '').toLowerCase(); + if (['true', '1', 'yes', 'on'].includes(val)) return true; + if (['false', '0', 'no', 'off'].includes(val)) return false; + return defaultValue; +} + +// Find Chrome binary +function findChrome() { + const chromeBinary = getEnv('CHROME_BINARY'); + if (chromeBinary && fs.existsSync(chromeBinary)) { + return chromeBinary; + } + + const candidates = [ + // Linux + '/usr/bin/google-chrome', + '/usr/bin/google-chrome-stable', + '/usr/bin/chromium', + '/usr/bin/chromium-browser', + // macOS + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + '/Applications/Chromium.app/Contents/MacOS/Chromium', + ]; + + for (const candidate of candidates) { + if (fs.existsSync(candidate)) { + return candidate; + } + } + + return null; +} + +// Parse resolution string +function parseResolution(resolution) { + const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); + return { width: width || 1440, height: height || 2000 }; +} + +// Find a free port +function findFreePort() { + return new Promise((resolve, reject) => { + const server = require('net').createServer(); + server.unref(); + server.on('error', reject); + server.listen(0, () => { + const port = server.address().port; + server.close(() => resolve(port)); + }); + }); +} + +// Wait for Chrome's DevTools port to be ready +function waitForDebugPort(port, timeout = 30000) { + const startTime = Date.now(); + + return new Promise((resolve, reject) => { + const tryConnect = () => { + if (Date.now() - startTime > timeout) { + reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); + return; + } + + const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { + let data = ''; + res.on('data', chunk => data += chunk); + res.on('end', () => { + try { + const info = JSON.parse(data); + resolve(info); + } catch (e) { + setTimeout(tryConnect, 100); + } + }); + }); + + req.on('error', () => { + setTimeout(tryConnect, 100); + }); + + req.setTimeout(1000, () => { + req.destroy(); + setTimeout(tryConnect, 100); + }); + }; + + tryConnect(); + }); +} + +async function launchChrome(binary) { + const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); + const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', 
getEnvBool('CHECK_SSL_VALIDITY', true)); + const headless = getEnvBool('CHROME_HEADLESS', true); + + const { width, height } = parseResolution(resolution); + + // Create output directory + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + } + + // Find a free port for Chrome DevTools + const debugPort = await findFreePort(); + console.log(`[*] Using debug port: ${debugPort}`); + + // Load any installed extensions + const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js'); + const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || + path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + + const installedExtensions = []; + if (fs.existsSync(extensionsDir)) { + const files = fs.readdirSync(extensionsDir); + for (const file of files) { + if (file.endsWith('.extension.json')) { + try { + const extPath = path.join(extensionsDir, file); + const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); + if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { + installedExtensions.push(extData); + console.log(`[*] Loading extension: ${extData.name || file}`); + } + } catch (e) { + // Skip invalid cache files + console.warn(`[!] Skipping invalid extension cache: ${file}`); + } + } + } + } + + // Get extension launch arguments + const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions); + if (extensionArgs.length > 0) { + console.log(`[+] Loaded ${installedExtensions.length} extension(s)`); + // Write extensions metadata for config hooks to use + fs.writeFileSync( + path.join(OUTPUT_DIR, 'extensions.json'), + JSON.stringify(installedExtensions, null, 2) + ); + } + + // Build Chrome arguments + const chromeArgs = [ + `--remote-debugging-port=${debugPort}`, + '--remote-debugging-address=127.0.0.1', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--disable-sync', + '--no-first-run', + '--no-default-browser-check', + '--disable-default-apps', + '--disable-infobars', + '--disable-blink-features=AutomationControlled', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-breakpad', + '--disable-background-networking', + '--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-ipc-flooding-protection', + '--password-store=basic', + '--use-mock-keychain', + '--font-render-hinting=none', + '--force-color-profile=srgb', + `--window-size=${width},${height}`, + ...extensionArgs, // Load extensions + ...(headless ? ['--headless=new'] : []), + ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), + 'about:blank', // Start with blank page + ]; + + // Launch Chrome as a child process (NOT detached - stays with crawl process) + // Using stdio: 'ignore' so we don't block on output but Chrome stays as our child + const chromeProcess = spawn(binary, chromeArgs, { + stdio: ['ignore', 'ignore', 'ignore'], + }); + + const chromePid = chromeProcess.pid; + console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`); + + // Write PID immediately for cleanup + fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid)); + fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort)); + + try { + // Wait for Chrome to be ready + const versionInfo = await waitForDebugPort(debugPort, 30000); + console.log(`[+] Chrome ready: ${versionInfo.Browser}`); + + // Build WebSocket URL + const wsUrl = versionInfo.webSocketDebuggerUrl; + fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl); + + return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort }; + + } catch (e) { + // Kill Chrome if setup failed + try { + process.kill(chromePid, 'SIGTERM'); + } catch (killErr) { + // Ignore + } + return { success: false, error: `${e.name}: ${e.message}` }; + } +} + +async function main() { + const args = parseArgs(); + const crawlId = args.crawl_id; + + const startTs = new Date(); + let status = 'failed'; + let output = null; + let error = ''; + let version = ''; + + try { + const binary = findChrome(); + if (!binary) { + console.error('ERROR: Chrome/Chromium binary not found'); + console.error('DEPENDENCY_NEEDED=chrome'); + console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); + console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable'); + process.exit(1); + } + + // Get Chrome version + try { + const { execSync } = require('child_process'); + version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64); + } catch (e) { + version = ''; + } + + const result = await launchChrome(binary); + + if (result.success) { + status = 'succeeded'; + output = OUTPUT_DIR; + console.log(`[+] Chrome session started for crawl ${crawlId}`); + console.log(`[+] CDP URL: ${result.cdpUrl}`); + console.log(`[+] PID: ${result.pid}`); + } else { + status = 'failed'; + error = result.error; + } + } catch (e) { + error = `${e.name}: ${e.message}`; + status = 'failed'; + } + + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + // Print results + console.log(`START_TS=${startTs.toISOString()}`); + console.log(`END_TS=${endTs.toISOString()}`); + console.log(`DURATION=${duration.toFixed(2)}`); + if (version) { + console.log(`VERSION=${version}`); + } + if (output) { + console.log(`OUTPUT=${output}`); + } + console.log(`STATUS=${status}`); + + if (error) { + console.error(`ERROR=${error}`); + } + + // Print JSON result + const resultJson = { + extractor: EXTRACTOR_NAME, + crawl_id: crawlId, + status, + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + duration: Math.round(duration * 100) / 100, + cmd_version: version, + output, + error: error || null, + }; + console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); + + // Exit with success - Chrome stays running as our child process + // It will be cleaned up when the crawl process terminates + process.exit(status === 'succeeded' ? 
0 : 1); +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js b/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js index 84837d0a..409ba212 100755 --- a/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js +++ b/archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js @@ -1,20 +1,21 @@ #!/usr/bin/env node /** - * Start a Chrome browser session for use by other extractors. + * Create a Chrome tab for this snapshot in the shared crawl Chrome session. * - * This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate. - * Pre-load extractors (21-29) can connect via CDP to register listeners before navigation. - * The chrome_navigate extractor (30) performs the actual page load. + * If a crawl-level Chrome session exists (from on_Crawl__10_chrome_session.js), + * this connects to it and creates a new tab. Otherwise, falls back to launching + * its own Chrome instance. * - * Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= + * Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= --crawl-id= * Output: Creates chrome_session/ with: - * - cdp_url.txt: WebSocket URL for CDP connection - * - pid.txt: Chrome process ID (for cleanup) - * - page_id.txt: Target ID of the page for other extractors to use - * - url.txt: The URL to be navigated to (for chrome_navigate) + * - cdp_url.txt: WebSocket URL for CDP connection (copied or new) + * - pid.txt: Chrome process ID (from crawl or new) + * - page_id.txt: Target ID of this snapshot's tab + * - url.txt: The URL to be navigated to * * Environment variables: - * CHROME_BINARY: Path to Chrome/Chromium binary + * DATA_DIR: Data directory (to find crawl's Chrome session) + * CHROME_BINARY: Path to Chrome/Chromium binary (for fallback) * CHROME_RESOLUTION: Page resolution (default: 1440,2000) * CHROME_USER_AGENT: User agent string (optional) * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) @@ -23,18 +24,13 @@ const fs = require('fs'); const path = require('path'); +const { spawn } = require('child_process'); +const http = require('http'); const puppeteer = require('puppeteer-core'); -// Import extension utilities -const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js'); - // Extractor metadata const EXTRACTOR_NAME = 'chrome_session'; -const OUTPUT_DIR = 'chrome_session'; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); +const OUTPUT_DIR = '.'; // Hook already runs in the output directory // Parse command line arguments function parseArgs() { @@ -60,13 +56,7 @@ function getEnvBool(name, defaultValue = false) { return defaultValue; } -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? 
defaultValue : val; -} - - -// Find Chrome binary +// Find Chrome binary (for fallback) function findChrome() { const chromeBinary = getEnv('CHROME_BINARY'); if (chromeBinary && fs.existsSync(chromeBinary)) { @@ -74,12 +64,10 @@ function findChrome() { } const candidates = [ - // Linux '/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/chromium', '/usr/bin/chromium-browser', - // macOS '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', '/Applications/Chromium.app/Contents/MacOS/Chromium', ]; @@ -99,40 +87,132 @@ function parseResolution(resolution) { return { width: width || 1440, height: height || 2000 }; } -// Load installed extensions from cache files -function loadInstalledExtensions() { - const extensions = []; +// Find a free port +function findFreePort() { + return new Promise((resolve, reject) => { + const server = require('net').createServer(); + server.unref(); + server.on('error', reject); + server.listen(0, () => { + const port = server.address().port; + server.close(() => resolve(port)); + }); + }); +} - if (!fs.existsSync(EXTENSIONS_DIR)) { - return extensions; - } +// Wait for Chrome's DevTools port to be ready +function waitForDebugPort(port, timeout = 30000) { + const startTime = Date.now(); - // Look for *.extension.json cache files created by extension plugins - const files = fs.readdirSync(EXTENSIONS_DIR); - const extensionFiles = files.filter(f => f.endsWith('.extension.json')); + return new Promise((resolve, reject) => { + const tryConnect = () => { + if (Date.now() - startTime > timeout) { + reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); + return; + } - for (const file of extensionFiles) { + const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { + let data = ''; + res.on('data', chunk => data += chunk); + res.on('end', () => { + try { + const info = JSON.parse(data); + resolve(info); + } catch (e) { + setTimeout(tryConnect, 100); + } + }); + }); + + req.on('error', () => { + setTimeout(tryConnect, 100); + }); + + req.setTimeout(1000, () => { + req.destroy(); + setTimeout(tryConnect, 100); + }); + }; + + tryConnect(); + }); +} + +// Try to find the crawl's Chrome session +function findCrawlChromeSession(crawlId) { + if (!crawlId) return null; + + const dataDir = getEnv('DATA_DIR', '.'); + const crawlChromeDir = path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session'); + + const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt'); + const pidFile = path.join(crawlChromeDir, 'pid.txt'); + + if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) { try { - const filePath = path.join(EXTENSIONS_DIR, file); - const data = fs.readFileSync(filePath, 'utf-8'); - const extension = JSON.parse(data); + const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); + const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10); - // Verify extension is actually installed - const manifestPath = path.join(extension.unpacked_path, 'manifest.json'); - if (fs.existsSync(manifestPath)) { - extensions.push(extension); - console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`); + // Verify the process is still running + try { + process.kill(pid, 0); // Signal 0 = check if process exists + return { cdpUrl, pid }; + } catch (e) { + // Process not running + return null; } } catch (e) { - console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`); + return null; } } - return extensions; + return null; } +// Create a new tab in an existing Chrome session +async function 
createTabInExistingChrome(cdpUrl, url, pid) { + const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); + const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); + const { width, height } = parseResolution(resolution); -async function startChromeSession(url, binary) { + console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`); + + // Connect Puppeteer to the running Chrome + const browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: { width, height }, + }); + + // Create a new tab for this snapshot + const page = await browser.newPage(); + + // Set viewport + await page.setViewport({ width, height }); + + // Set user agent if specified + if (userAgent) { + await page.setUserAgent(userAgent); + } + + // Get the page target ID + const target = page.target(); + const targetId = target._targetId; + + // Write session info + fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl); + fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(pid)); + fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId); + fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url); + fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'true'); + + // Disconnect Puppeteer (Chrome and tab stay alive) + browser.disconnect(); + + return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid, shared: true }; +} + +// Fallback: Launch a new Chrome instance for this snapshot +async function launchNewChrome(url, binary) { const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''); const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); @@ -140,115 +220,98 @@ async function startChromeSession(url, binary) { const { width, height } = parseResolution(resolution); - // Load installed extensions - const extensions = loadInstalledExtensions(); - const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions); + // Find a free port for Chrome DevTools + const debugPort = await findFreePort(); + console.log(`[*] Launching new Chrome on port: ${debugPort}`); - if (extensions.length > 0) { - console.log(`[*] Loading ${extensions.length} Chrome extensions...`); - } + // Build Chrome arguments + const chromeArgs = [ + `--remote-debugging-port=${debugPort}`, + '--remote-debugging-address=127.0.0.1', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--disable-sync', + '--no-first-run', + '--no-default-browser-check', + '--disable-default-apps', + '--disable-infobars', + '--disable-blink-features=AutomationControlled', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-breakpad', + '--disable-background-networking', + '--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-ipc-flooding-protection', + '--password-store=basic', + '--use-mock-keychain', + '--font-render-hinting=none', + '--force-color-profile=srgb', + `--window-size=${width},${height}`, + ...(headless ? ['--headless=new'] : []), + ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), + 'about:blank', + ]; - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } + // Launch Chrome as a detached process (since no crawl-level Chrome exists) + const chromeProcess = spawn(binary, chromeArgs, { + detached: true, + stdio: ['ignore', 'ignore', 'ignore'], + }); + chromeProcess.unref(); - let browser = null; + const chromePid = chromeProcess.pid; + console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`); + + // Write PID immediately for cleanup + fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid)); try { - // Launch browser with Puppeteer - browser = await puppeteer.launch({ - executablePath: binary, - headless: headless ? 'new' : false, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-sync', - '--no-first-run', - '--no-default-browser-check', - '--disable-default-apps', - '--disable-infobars', - '--disable-blink-features=AutomationControlled', - '--disable-component-update', - '--disable-domain-reliability', - '--disable-breakpad', - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-ipc-flooding-protection', - '--password-store=basic', - '--use-mock-keychain', - '--font-render-hinting=none', - '--force-color-profile=srgb', - `--window-size=${width},${height}`, - ...(checkSsl ? [] : ['--ignore-certificate-errors']), - ...extensionArgs, - ], + // Wait for Chrome to be ready + const versionInfo = await waitForDebugPort(debugPort, 30000); + console.log(`[+] Chrome ready: ${versionInfo.Browser}`); + + const wsUrl = versionInfo.webSocketDebuggerUrl; + fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl); + + // Connect Puppeteer to get page info + const browser = await puppeteer.connect({ + browserWSEndpoint: wsUrl, defaultViewport: { width, height }, }); - // Get the WebSocket endpoint URL - const cdpUrl = browser.wsEndpoint(); - fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl); + let pages = await browser.pages(); + let page = pages[0]; - // Write PID for cleanup - const browserProcess = browser.process(); - if (browserProcess) { - fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid)); + if (!page) { + page = await browser.newPage(); } - // Create a new page (but DON'T navigate yet) - const page = await browser.newPage(); + await page.setViewport({ width, height }); - // Set user agent if specified if (userAgent) { await page.setUserAgent(userAgent); } - // Write the page target ID so other extractors can find this specific page const target = page.target(); const targetId = target._targetId; + fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId); - - // Write the URL for chrome_navigate to use fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url); + fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'false'); - // Connect to loaded extensions at runtime (only if not already done) - const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json'); - if (extensions.length > 0 && !fs.existsSync(extensionsFile)) { - console.log('[*] Connecting to loaded extensions (first time setup)...'); - try { - const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions); - - // Write loaded extensions metadata for other extractors to use - 
fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2)); - - console.log(`[+] Extensions loaded and available at ${extensionsFile}`); - console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`); - } catch (e) { - console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`); - } - } else if (extensions.length > 0) { - console.log('[*] Extensions already loaded from previous snapshot'); - } - - // Don't close browser - leave it running for other extractors - // Detach puppeteer from browser so it stays running browser.disconnect(); - return { success: true, output: OUTPUT_DIR, cdpUrl, targetId }; + return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid, shared: false }; } catch (e) { - // Kill browser if startup failed - if (browser) { - try { - await browser.close(); - } catch (closeErr) { - // Ignore - } + try { + process.kill(chromePid, 'SIGTERM'); + } catch (killErr) { + // Ignore } return { success: false, error: `${e.name}: ${e.message}` }; } @@ -258,9 +321,10 @@ async function main() { const args = parseArgs(); const url = args.url; const snapshotId = args.snapshot_id; + const crawlId = args.crawl_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= [--crawl-id=]'); process.exit(1); } @@ -271,9 +335,6 @@ async function main() { let version = ''; try { - // chrome_session launches Chrome and creates a blank page - // Pre-load extractors (21-29) register CDP listeners - // chrome_navigate (30) performs actual navigation const binary = findChrome(); if (!binary) { console.error('ERROR: Chrome/Chromium binary not found'); @@ -291,13 +352,24 @@ async function main() { version = ''; } - const result = await startChromeSession(url, binary); + // Try to use existing crawl Chrome session + const crawlSession = findCrawlChromeSession(crawlId); + let result; + + if (crawlSession) { + console.log(`[*] Found existing Chrome session from crawl ${crawlId}`); + result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); + } else { + console.log(`[*] No crawl Chrome session found, launching new Chrome`); + result = await launchNewChrome(url, binary); + } if (result.success) { status = 'succeeded'; output = result.output; - console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`); - console.log(`Page target ID: ${result.targetId}`); + console.log(`[+] Chrome session ready (shared: ${result.shared})`); + console.log(`[+] CDP URL: ${result.cdpUrl}`); + console.log(`[+] Page target ID: ${result.targetId}`); } else { status = 'failed'; error = result.error; @@ -331,6 +403,7 @@ async function main() { extractor: EXTRACTOR_NAME, url, snapshot_id: snapshotId, + crawl_id: crawlId || null, status, start_ts: startTs.toISOString(), end_ts: endTs.toISOString(), diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js index fc90aa03..f305b08a 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js @@ -1,31 +1,24 @@ #!/usr/bin/env node /** - * Capture console output from a page. + * Capture console output from a page (DAEMON MODE). 
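Distilled, the per-snapshot handshake with the shared crawl browser shown above is: connect to the WebSocket endpoint from `chrome_session/cdp_url.txt`, open a fresh tab, record its target ID so later hooks can find that exact tab, then disconnect without closing anything. A condensed sketch, assuming puppeteer-core (the `openSnapshotTab` helper and fixed 1440x2000 viewport are illustrative; `_targetId` is the same private Puppeteer field the hooks above rely on):

```js
const fs = require('fs');
const puppeteer = require('puppeteer-core');

async function openSnapshotTab(sessionDir, url) {
  const browserWSEndpoint = fs.readFileSync(`${sessionDir}/cdp_url.txt`, 'utf8').trim();
  const browser = await puppeteer.connect({ browserWSEndpoint });

  const page = await browser.newPage();                  // one tab per snapshot
  await page.setViewport({ width: 1440, height: 2000 });

  const targetId = page.target()._targetId;              // private field, mirrored from the hooks above
  fs.writeFileSync('page_id.txt', targetId);             // lets chrome_navigate & co. find this exact tab
  fs.writeFileSync('url.txt', url);                      // navigation happens later, in chrome_navigate

  browser.disconnect();                                  // Chrome and the new tab stay alive
  return targetId;
}
```

Disconnecting (rather than closing) is what makes the session shareable: the browser and its tabs outlive any single hook process.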
* - * Captures all console messages during page load: - * - log, warn, error, info, debug - * - Includes stack traces for errors - * - Timestamps for each message + * This hook daemonizes and stays alive to capture console logs throughout + * the snapshot lifecycle. It's killed by chrome_cleanup at the end. * - * Usage: on_Snapshot__14_consolelog.js --url= --snapshot-id= - * Output: Writes consolelog/console.jsonl (one message per line) - * - * Environment variables: - * SAVE_CONSOLELOG: Enable console log capture (default: true) - * CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5) + * Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id= + * Output: Writes console.jsonl + listener.pid */ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -// Extractor metadata const EXTRACTOR_NAME = 'consolelog'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; +const PID_FILE = 'listener.pid'; const CHROME_SESSION_DIR = '../chrome_session'; -// Parse command line arguments function parseArgs() { const args = {}; process.argv.slice(2).forEach(arg => { @@ -37,7 +30,6 @@ function parseArgs() { return args; } -// Get environment variable with default function getEnv(name, defaultValue = '') { return (process.env[name] || defaultValue).trim(); } @@ -49,12 +41,6 @@ function getEnvBool(name, defaultValue = false) { return defaultValue; } -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - -// Get CDP URL from chrome_session function getCdpUrl() { const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); if (fs.existsSync(cdpFile)) { @@ -63,7 +49,14 @@ function getCdpUrl() { return null; } -// Serialize console message arguments +function getPageId() { + const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt'); + if (fs.existsSync(pageIdFile)) { + return fs.readFileSync(pageIdFile, 'utf8').trim(); + } + return null; +} + async function serializeArgs(args) { const serialized = []; for (const arg of args) { @@ -71,7 +64,6 @@ async function serializeArgs(args) { const json = await arg.jsonValue(); serialized.push(json); } catch (e) { - // If jsonValue() fails, try to get text representation try { serialized.push(String(arg)); } catch (e2) { @@ -82,128 +74,84 @@ async function serializeArgs(args) { return serialized; } -// Capture console logs -async function captureConsoleLogs(url) { - const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000; - - // Output directory is current directory (hook already runs in output dir) +async function setupListeners() { const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); + fs.writeFileSync(outputPath, ''); // Clear existing - // Clear existing file - fs.writeFileSync(outputPath, ''); - - let browser = null; - const consoleLogs = []; - - try { - // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' }; - } - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); - - // Get the page - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } - - // Listen for console messages - page.on('console', async (msg) => { - try { - const type = msg.type(); - const text = msg.text(); - const 
location = msg.location(); - const args = await serializeArgs(msg.args()); - - const logEntry = { - timestamp: new Date().toISOString(), - type, - text, - args, - location: { - url: location.url || '', - lineNumber: location.lineNumber, - columnNumber: location.columnNumber, - }, - }; - - // Write immediately to file - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - consoleLogs.push(logEntry); - } catch (e) { - // Error processing console message, skip it - console.error(`Error processing console message: ${e.message}`); - } - }); - - // Listen for page errors - page.on('pageerror', (error) => { - try { - const logEntry = { - timestamp: new Date().toISOString(), - type: 'error', - text: error.message, - stack: error.stack || '', - location: {}, - }; - - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - consoleLogs.push(logEntry); - } catch (e) { - console.error(`Error processing page error: ${e.message}`); - } - }); - - // Listen for request failures - page.on('requestfailed', (request) => { - try { - const failure = request.failure(); - const logEntry = { - timestamp: new Date().toISOString(), - type: 'request_failed', - text: `Request failed: ${request.url()}`, - error: failure ? failure.errorText : 'Unknown error', - url: request.url(), - location: {}, - }; - - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - consoleLogs.push(logEntry); - } catch (e) { - console.error(`Error processing request failure: ${e.message}`); - } - }); - - // Wait to capture logs - await new Promise(resolve => setTimeout(resolve, captureTimeout)); - - // Group logs by type - const logStats = consoleLogs.reduce((acc, log) => { - acc[log.type] = (acc[log.type] || 0) + 1; - return acc; - }, {}); - - return { - success: true, - output: outputPath, - logCount: consoleLogs.length, - logStats, - }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + throw new Error('No Chrome session found'); } + + const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); + + // Find our page + const pages = await browser.pages(); + const pageId = getPageId(); + let page = null; + + if (pageId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === pageId; + }); + } + if (!page) { + page = pages[pages.length - 1]; + } + + if (!page) { + throw new Error('No page found'); + } + + // Set up listeners that write directly to file + page.on('console', async (msg) => { + try { + const logEntry = { + timestamp: new Date().toISOString(), + type: msg.type(), + text: msg.text(), + args: await serializeArgs(msg.args()), + location: msg.location(), + }; + fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + } catch (e) { + // Ignore errors + } + }); + + page.on('pageerror', (error) => { + try { + const logEntry = { + timestamp: new Date().toISOString(), + type: 'error', + text: error.message, + stack: error.stack || '', + }; + fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + } catch (e) { + // Ignore + } + }); + + page.on('requestfailed', (request) => { + try { + const failure = request.failure(); + const logEntry = { + timestamp: new Date().toISOString(), + type: 'request_failed', + text: `Request failed: ${request.url()}`, + error: failure ? 
failure.errorText : 'Unknown error', + url: request.url(), + }; + fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + } catch (e) { + // Ignore + } + }); + + // Don't disconnect - keep browser connection alive + return { browser, page }; } async function main() { @@ -212,80 +160,83 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__14_consolelog.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id='); process.exit(1); } + if (!getEnvBool('SAVE_CONSOLELOG', true)) { + console.log('Skipping (SAVE_CONSOLELOG=False)'); + const result = { + extractor: EXTRACTOR_NAME, + status: 'skipped', + url, + snapshot_id: snapshotId, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(0); + } + const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - let logCount = 0; try { - // Check if enabled - if (!getEnvBool('SAVE_CONSOLELOG', true)) { - console.log('Skipping console log (SAVE_CONSOLELOG=False)'); - status = 'skipped'; - const endTs = new Date(); - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`STATUS=${status}`); - console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`); - process.exit(0); - } + // Set up listeners + await setupListeners(); - const result = await captureConsoleLogs(url); + // Write PID file so chrome_cleanup can kill us + fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); - if (result.success) { - status = 'succeeded'; - output = result.output; - logCount = result.logCount || 0; - const statsStr = Object.entries(result.logStats || {}) - .map(([type, count]) => `${count} ${type}`) - .join(', '); - console.log(`Captured ${logCount} console messages: ${statsStr}`); - } else { - status = 'failed'; - error = result.error; + // Report success immediately (we're staying alive in background) + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + console.log(`START_TS=${startTs.toISOString()}`); + console.log(`END_TS=${endTs.toISOString()}`); + console.log(`DURATION=${duration.toFixed(2)}`); + console.log(`OUTPUT=${OUTPUT_FILE}`); + console.log(`STATUS=succeeded`); + + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'succeeded', + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + duration: Math.round(duration * 100) / 100, + output: OUTPUT_FILE, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + + // Daemonize: detach from parent and keep running + // This process will be killed by chrome_cleanup + if (process.stdin.isTTY) { + process.stdin.pause(); } + process.stdin.unref(); + process.stdout.end(); + process.stderr.end(); + + // Keep the process alive indefinitely + // Will be killed by chrome_cleanup via the PID file + setInterval(() => {}, 1000); + } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - // Print results - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } - console.log(`STATUS=${status}`); - - if (error) { + const error = `${e.name}: ${e.message}`; console.error(`ERROR=${error}`); + + const endTs = 
new Date(); + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'failed', + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + error, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(1); } - - // Print JSON result - const resultJson = { - extractor: EXTRACTOR_NAME, - url, - snapshot_id: snapshotId, - status, - start_ts: startTs.toISOString(), - end_ts: endTs.toISOString(), - duration: Math.round(duration * 100) / 100, - output, - log_count: logCount, - error: error || null, - }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); - - process.exit(status === 'succeeded' ? 0 : 1); } main().catch(e => { diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js similarity index 97% rename from archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js index 0f2672f5..77b50dec 100755 --- a/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js +++ b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js @@ -7,8 +7,8 @@ * * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm * - * Priority: 02 (early) - Must install before Chrome session starts - * Hook: on_Snapshot + * Priority: 02 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: * - Dismisses cookie consent popups diff --git a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js deleted file mode 100755 index aaa43232..00000000 --- a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js +++ /dev/null @@ -1,278 +0,0 @@ -#!/usr/bin/env node -/** - * Track complete redirect chains for a URL. 
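The consolelog rewrite above is the template for the "daemon mode" hooks: attach CDP listeners, append events to a JSONL file as they arrive, write `listener.pid`, report success immediately, then stay resident until chrome_cleanup kills the process. A stripped-down sketch of that pattern (the `startConsoleListener` helper is illustrative; `page` is assumed to be resolved as in the previous sketch):

```js
const fs = require('fs');

function startConsoleListener(page, outputFile) {
  fs.writeFileSync(outputFile, '');                       // truncate output from any previous run
  fs.writeFileSync('listener.pid', String(process.pid));  // read by chrome_cleanup to kill this daemon

  page.on('console', (msg) => {
    const entry = {
      timestamp: new Date().toISOString(),
      type: msg.type(),                                   // log, warn, error, ...
      text: msg.text(),
    };
    fs.appendFileSync(outputFile, JSON.stringify(entry) + '\n'); // JSONL: one event per line
  });

  // Keep the event loop (and the CDP connection) alive indefinitely;
  // termination is external, via the PID file.
  setInterval(() => {}, 1000);
}
```

Appending line-by-line means a crash or SIGKILL at any point still leaves a valid, parseable log behind.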
- * - * Captures: - * - HTTP redirects (301, 302, 303, 307, 308) - * - Meta refresh redirects - * - JavaScript redirects (basic detection) - * - Full redirect chain with timestamps - * - * Usage: on_Snapshot__15_redirects.js --url= --snapshot-id= - * Output: Writes redirects/redirects.json - * - * Environment variables: - * SAVE_REDIRECTS: Enable redirect tracking (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -const puppeteer = require('puppeteer-core'); - -// Extractor metadata -const EXTRACTOR_NAME = 'redirects'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'redirects.json'; -const CHROME_SESSION_DIR = '../chrome_session'; - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -// Get CDP URL from chrome_session -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - -// Track redirect chain -async function trackRedirects(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - - let browser = null; - const redirectChain = []; - - try { - // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' }; - } - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); - - // Get the page - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } - - // Track all responses to capture redirects - page.on('response', async (response) => { - const status = response.status(); - const responseUrl = response.url(); - const headers = response.headers(); - - // Check if it's a redirect - if (status >= 300 && status < 400) { - redirectChain.push({ - timestamp: new Date().toISOString(), - url: responseUrl, - status, - statusText: response.statusText(), - location: headers['location'] || headers['Location'] || '', - type: 'http', - }); - } - }); - - // Get the current URL (which is the final destination after redirects) - const finalUrl = page.url(); - - // Check for meta refresh redirects - const metaRefresh = await page.evaluate(() => { - const meta = document.querySelector('meta[http-equiv="refresh"]'); - if (meta) { - const content = meta.getAttribute('content') || ''; - const match = content.match(/url=['"]?([^'"]+)['"]?/i); - return { - content, - url: match ? 
match[1] : null, - }; - } - return null; - }); - - if (metaRefresh && metaRefresh.url) { - redirectChain.push({ - timestamp: new Date().toISOString(), - url: finalUrl, - status: null, - statusText: 'Meta Refresh', - location: metaRefresh.url, - type: 'meta_refresh', - content: metaRefresh.content, - }); - } - - // Check for JavaScript redirects (basic detection) - const jsRedirect = await page.evaluate(() => { - // Check for common JavaScript redirect patterns - const html = document.documentElement.outerHTML; - const patterns = [ - /window\.location\s*=\s*['"]([^'"]+)['"]/i, - /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i, - /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i, - /document\.location\s*=\s*['"]([^'"]+)['"]/i, - ]; - - for (const pattern of patterns) { - const match = html.match(pattern); - if (match) { - return { - pattern: pattern.toString(), - url: match[1], - }; - } - } - return null; - }); - - if (jsRedirect && jsRedirect.url) { - redirectChain.push({ - timestamp: new Date().toISOString(), - url: finalUrl, - status: null, - statusText: 'JavaScript Redirect', - location: jsRedirect.url, - type: 'javascript', - pattern: jsRedirect.pattern, - }); - } - - const redirectData = { - original_url: url, - final_url: finalUrl, - redirect_count: redirectChain.length, - redirects: redirectChain, - is_redirect: redirectChain.length > 0, - }; - - // Write output - fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2)); - - return { success: true, output: outputPath, redirectData }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__15_redirects.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - - try { - // Check if enabled - if (!getEnvBool('SAVE_REDIRECTS', true)) { - console.log('Skipping redirects (SAVE_REDIRECTS=False)'); - status = 'skipped'; - const endTs = new Date(); - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`STATUS=${status}`); - console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`); - process.exit(0); - } - - const result = await trackRedirects(url); - - if (result.success) { - status = 'succeeded'; - output = result.output; - const redirectCount = result.redirectData.redirect_count; - const finalUrl = result.redirectData.final_url; - if (redirectCount > 0) { - console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`); - } else { - console.log('No redirects detected'); - } - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - // Print results - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } - console.log(`STATUS=${status}`); - - if (error) { - console.error(`ERROR=${error}`); - } - - // Print JSON result - const resultJson = { - extractor: EXTRACTOR_NAME, - url, - snapshot_id: snapshotId, - status, - start_ts: 
startTs.toISOString(), - end_ts: endTs.toISOString(), - duration: Math.round(duration * 100) / 100, - output, - error: error || null, - }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.js new file mode 100755 index 00000000..9a4188a5 --- /dev/null +++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.js @@ -0,0 +1,248 @@ +#!/usr/bin/env node +/** + * Detect redirects by comparing original URL to final URL. + * + * This runs AFTER chrome_navigate and checks: + * - URL changed (HTTP redirect occurred) + * - Meta refresh tags (pending redirects) + * - JavaScript redirects (basic detection) + * + * Usage: on_Snapshot__31_redirects.js --url= --snapshot-id= + * Output: Writes redirects.json + */ + +const fs = require('fs'); +const path = require('path'); +const puppeteer = require('puppeteer-core'); + +const EXTRACTOR_NAME = 'redirects'; +const OUTPUT_DIR = '.'; +const OUTPUT_FILE = 'redirects.json'; +const CHROME_SESSION_DIR = '../chrome_session'; +const CHROME_NAVIGATE_DIR = '../chrome_navigate'; + +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach(arg => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +function getEnv(name, defaultValue = '') { + return (process.env[name] || defaultValue).trim(); +} + +function getEnvBool(name, defaultValue = false) { + const val = getEnv(name, '').toLowerCase(); + if (['true', '1', 'yes', 'on'].includes(val)) return true; + if (['false', '0', 'no', 'off'].includes(val)) return false; + return defaultValue; +} + +function getCdpUrl() { + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); + if (fs.existsSync(cdpFile)) { + return fs.readFileSync(cdpFile, 'utf8').trim(); + } + return null; +} + +function getPageId() { + const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt'); + if (fs.existsSync(pageIdFile)) { + return fs.readFileSync(pageIdFile, 'utf8').trim(); + } + return null; +} + +function getFinalUrl() { + // Try chrome_navigate output first + const navFile = path.join(CHROME_NAVIGATE_DIR, 'final_url.txt'); + if (fs.existsSync(navFile)) { + return fs.readFileSync(navFile, 'utf8').trim(); + } + return null; +} + +async function detectRedirects(originalUrl) { + const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); + const redirects = []; + + // Get final URL from chrome_navigate + let finalUrl = getFinalUrl() || originalUrl; + + // Check if URL changed (indicates redirect) + const urlChanged = originalUrl !== finalUrl; + if (urlChanged) { + redirects.push({ + timestamp: new Date().toISOString(), + from_url: originalUrl, + to_url: finalUrl, + type: 'http', + detected_by: 'url_comparison', + }); + } + + // Connect to Chrome to check for meta refresh and JS redirects + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + let browser = null; + try { + browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); + + const pages = await browser.pages(); + const pageId = getPageId(); + let page = null; + + if (pageId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === pageId; + }); + } + if (!page) { + page = pages.find(p => p.url().startsWith('http')) || 
pages[pages.length - 1]; + } + + if (page) { + // Update finalUrl from actual page + const pageUrl = page.url(); + if (pageUrl && pageUrl !== 'about:blank') { + finalUrl = pageUrl; + } + + // Check for meta refresh + try { + const metaRefresh = await page.evaluate(() => { + const meta = document.querySelector('meta[http-equiv="refresh"]'); + if (meta) { + const content = meta.getAttribute('content') || ''; + const match = content.match(/url=['"]?([^'";\s]+)['"]?/i); + return { content, url: match ? match[1] : null }; + } + return null; + }); + + if (metaRefresh && metaRefresh.url) { + redirects.push({ + timestamp: new Date().toISOString(), + from_url: finalUrl, + to_url: metaRefresh.url, + type: 'meta_refresh', + content: metaRefresh.content, + }); + } + } catch (e) { /* ignore */ } + + // Check for JS redirects + try { + const jsRedirect = await page.evaluate(() => { + const html = document.documentElement.outerHTML; + const patterns = [ + /window\.location\s*=\s*['"]([^'"]+)['"]/i, + /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i, + /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i, + ]; + for (const pattern of patterns) { + const match = html.match(pattern); + if (match) return { url: match[1], pattern: pattern.toString() }; + } + return null; + }); + + if (jsRedirect && jsRedirect.url) { + redirects.push({ + timestamp: new Date().toISOString(), + from_url: finalUrl, + to_url: jsRedirect.url, + type: 'javascript', + }); + } + } catch (e) { /* ignore */ } + } + + browser.disconnect(); + } catch (e) { + console.error(`Warning: Could not connect to Chrome: ${e.message}`); + } + } + + const result = { + original_url: originalUrl, + final_url: finalUrl, + redirect_count: redirects.length, + redirects, + is_redirect: originalUrl !== finalUrl || redirects.length > 0, + }; + + fs.writeFileSync(outputPath, JSON.stringify(result, null, 2)); + return { success: true, output: outputPath, data: result }; +} + +async function main() { + const args = parseArgs(); + const url = args.url; + const snapshotId = args.snapshot_id; + + if (!url || !snapshotId) { + console.error('Usage: on_Snapshot__31_redirects.js --url= --snapshot-id='); + process.exit(1); + } + + const startTs = new Date(); + let status = 'failed'; + let output = null; + let error = ''; + + if (!getEnvBool('SAVE_REDIRECTS', true)) { + console.log('Skipping redirects (SAVE_REDIRECTS=False)'); + status = 'skipped'; + } else { + try { + const result = await detectRedirects(url); + status = 'succeeded'; + output = result.output; + + if (result.data.is_redirect) { + console.log(`Redirect detected: ${url} -> ${result.data.final_url}`); + } else { + console.log('No redirects detected'); + } + } catch (e) { + error = `${e.name}: ${e.message}`; + } + } + + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + console.log(`START_TS=${startTs.toISOString()}`); + console.log(`END_TS=${endTs.toISOString()}`); + console.log(`DURATION=${duration.toFixed(2)}`); + if (output) console.log(`OUTPUT=${output}`); + console.log(`STATUS=${status}`); + if (error) console.error(`ERROR=${error}`); + + console.log(`RESULT_JSON=${JSON.stringify({ + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status, + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + duration: Math.round(duration * 100) / 100, + output, + error: error || null, + })}`); + + process.exit(status === 'succeeded' ? 
0 : 1); +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.js b/archivebox/plugins/responses/on_Snapshot__24_responses.js index c69743b4..5c65f3fe 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.js @@ -1,22 +1,12 @@ #!/usr/bin/env node /** - * Archive all network responses during page load. + * Archive all network responses during page load (DAEMON MODE). * - * Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.) - * Saves them in an organized directory structure with both timestamped unique files - * and URL-organized symlinks. + * This hook daemonizes and stays alive to capture network responses throughout + * the snapshot lifecycle. It's killed by chrome_cleanup at the end. * - * Usage: on_Snapshot__23_responses.js --url= --snapshot-id= - * Output: Creates responses/ directory with: - * - all/____.: Timestamped unique files - * - ///: URL-organized symlinks by resource type - * - index.jsonl: Searchable index of all responses - * - * Environment variables: - * SAVE_RESPONSES: Enable response archiving (default: true) - * RESPONSES_TIMEOUT: Timeout in seconds (default: 120) - * RESPONSES_TYPES: Comma-separated resource types to save (default: all) - * Options: script,stylesheet,font,image,media,xhr,websocket,document + * Usage: on_Snapshot__24_responses.js --url= --snapshot-id= + * Output: Creates responses/ directory with index.jsonl + listener.pid */ const fs = require('fs'); @@ -27,6 +17,7 @@ const puppeteer = require('puppeteer-core'); // Extractor metadata const EXTRACTOR_NAME = 'responses'; const OUTPUT_DIR = '.'; +const PID_FILE = 'listener.pid'; const CHROME_SESSION_DIR = '../chrome_session'; // Resource types to capture (by default, capture everything) @@ -70,6 +61,14 @@ function getCdpUrl() { return null; } +function getPageId() { + const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt'); + if (fs.existsSync(pageIdFile)) { + return fs.readFileSync(pageIdFile, 'utf8').trim(); + } + return null; +} + // Get file extension from MIME type function getExtensionFromMimeType(mimeType) { const mimeMap = { @@ -139,17 +138,14 @@ async function createSymlink(target, linkPath) { fs.symlinkSync(relativePath, linkPath); } catch (e) { // Ignore symlink errors (file conflicts, permissions, etc.) 
- console.error(`Failed to create symlink: ${e.message}`); } } -// Archive responses by intercepting network traffic -async function archiveResponses(originalUrl) { - const timeout = (getEnvInt('RESPONSES_TIMEOUT') || getEnvInt('TIMEOUT', 120)) * 1000; +// Set up response listener +async function setupListener() { const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(',')); const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase()); - // Output directory is current directory (hook already runs in output dir) // Create subdirectories for organizing responses const allDir = path.join(OUTPUT_DIR, 'all'); if (!fs.existsSync(allDir)) { @@ -160,138 +156,119 @@ async function archiveResponses(originalUrl) { const indexPath = path.join(OUTPUT_DIR, 'index.jsonl'); fs.writeFileSync(indexPath, ''); // Clear existing - let browser = null; - let savedCount = 0; - const savedResponses = []; - - try { - // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' }; - } - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); - - // Get the page - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } - - // Enable request interception - await page.setRequestInterception(false); // Don't block requests - - // Listen for responses - page.on('response', async (response) => { - try { - const request = response.request(); - const url = response.url(); - const resourceType = request.resourceType().toLowerCase(); - const method = request.method(); - const status = response.status(); - - // Skip redirects and errors - if (status >= 300 && status < 400) return; - if (status >= 400 && status < 600) return; - - // Check if we should save this resource type - if (typesToSave.length && !typesToSave.includes(resourceType)) { - return; - } - - // Get response body - let bodyBuffer = null; - try { - bodyBuffer = await response.buffer(); - } catch (e) { - // Some responses can't be captured (already consumed, etc.) - return; - } - - if (!bodyBuffer || bodyBuffer.length === 0) { - return; - } - - // Determine file extension - const mimeType = response.headers()['content-type'] || ''; - let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url); - - // Create timestamp-based unique filename - const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, ''); - const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64)); - const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`; - const uniquePath = path.join(allDir, uniqueFilename); - - // Save to unique file - fs.writeFileSync(uniquePath, bodyBuffer); - - // Create URL-organized symlink - try { - const urlObj = new URL(url); - const hostname = urlObj.hostname; - const pathname = urlObj.pathname || '/'; - const filename = path.basename(pathname) || 'index' + (extension ? '.' 
+ extension : ''); - const dirPath = path.dirname(pathname); - - // Create symlink: responses//// - const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath); - const symlinkPath = path.join(symlinkDir, filename); - await createSymlink(uniquePath, symlinkPath); - } catch (e) { - // URL parsing or symlink creation failed, skip - } - - // Calculate SHA256 - const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex'); - const urlSha256 = crypto.createHash('sha256').update(url).digest('hex'); - - // Write to index - const indexEntry = { - ts: timestamp, - method, - url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs - urlSha256, - status, - resourceType, - mimeType: mimeType.split(';')[0], - responseSha256: sha256, - path: './' + path.relative(OUTPUT_DIR, uniquePath), - extension, - }; - - fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n'); - savedResponses.push(indexEntry); - savedCount++; - - } catch (e) { - // Log but don't fail the whole extraction - console.error(`Error capturing response: ${e.message}`); - } - }); - - // Wait a bit to ensure we capture responses - // (chrome_session already loaded the page, just capture any remaining traffic) - await new Promise(resolve => setTimeout(resolve, 2000)); - - return { - success: true, - output: OUTPUT_DIR, - savedCount, - indexPath, - }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + throw new Error('No Chrome session found'); } + + const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); + + // Find our page + const pages = await browser.pages(); + const pageId = getPageId(); + let page = null; + + if (pageId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === pageId; + }); + } + if (!page) { + page = pages[pages.length - 1]; + } + + if (!page) { + throw new Error('No page found'); + } + + // Set up response listener to capture network traffic + page.on('response', async (response) => { + try { + const request = response.request(); + const url = response.url(); + const resourceType = request.resourceType().toLowerCase(); + const method = request.method(); + const status = response.status(); + + // Skip redirects and errors + if (status >= 300 && status < 400) return; + if (status >= 400 && status < 600) return; + + // Check if we should save this resource type + if (typesToSave.length && !typesToSave.includes(resourceType)) { + return; + } + + // Get response body + let bodyBuffer = null; + try { + bodyBuffer = await response.buffer(); + } catch (e) { + // Some responses can't be captured (already consumed, etc.) + return; + } + + if (!bodyBuffer || bodyBuffer.length === 0) { + return; + } + + // Determine file extension + const mimeType = response.headers()['content-type'] || ''; + let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url); + + // Create timestamp-based unique filename + const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, ''); + const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64)); + const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' 
+ extension : ''}`; + const uniquePath = path.join(allDir, uniqueFilename); + + // Save to unique file + fs.writeFileSync(uniquePath, bodyBuffer); + + // Create URL-organized symlink + try { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + const pathname = urlObj.pathname || '/'; + const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : ''); + const dirPath = path.dirname(pathname); + + // Create symlink: responses//// + const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath); + const symlinkPath = path.join(symlinkDir, filename); + await createSymlink(uniquePath, symlinkPath); + } catch (e) { + // URL parsing or symlink creation failed, skip + } + + // Calculate SHA256 + const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex'); + const urlSha256 = crypto.createHash('sha256').update(url).digest('hex'); + + // Write to index + const indexEntry = { + ts: timestamp, + method, + url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs + urlSha256, + status, + resourceType, + mimeType: mimeType.split(';')[0], + responseSha256: sha256, + path: './' + path.relative(OUTPUT_DIR, uniquePath), + extension, + }; + + fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n'); + + } catch (e) { + // Ignore errors + } + }); + + // Don't disconnect - keep browser connection alive + return { browser, page }; } async function main() { @@ -300,77 +277,83 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__23_responses.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__24_responses.js --url= --snapshot-id='); process.exit(1); } + if (!getEnvBool('SAVE_RESPONSES', true)) { + console.log('Skipping (SAVE_RESPONSES=False)'); + const result = { + extractor: EXTRACTOR_NAME, + status: 'skipped', + url, + snapshot_id: snapshotId, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(0); + } + const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - let savedCount = 0; try { - // Check if enabled - if (!getEnvBool('SAVE_RESPONSES', true)) { - console.log('Skipping responses (SAVE_RESPONSES=False)'); - status = 'skipped'; - const endTs = new Date(); - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`STATUS=${status}`); - console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`); - process.exit(0); - } + // Set up listener + await setupListener(); - const result = await archiveResponses(url); + // Write PID file so chrome_cleanup can kill us + fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); - if (result.success) { - status = 'succeeded'; - output = result.output; - savedCount = result.savedCount || 0; - console.log(`Saved ${savedCount} network responses to ${output}/`); - } else { - status = 'failed'; - error = result.error; + // Report success immediately (we're staying alive in background) + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + console.log(`START_TS=${startTs.toISOString()}`); + console.log(`END_TS=${endTs.toISOString()}`); + console.log(`DURATION=${duration.toFixed(2)}`); + console.log(`OUTPUT=responses/`); + console.log(`STATUS=succeeded`); + + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'succeeded', + start_ts: startTs.toISOString(), + 
end_ts: endTs.toISOString(), + duration: Math.round(duration * 100) / 100, + output: 'responses/', + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + + // Daemonize: detach from parent and keep running + // This process will be killed by chrome_cleanup + if (process.stdin.isTTY) { + process.stdin.pause(); } + process.stdin.unref(); + process.stdout.end(); + process.stderr.end(); + + // Keep the process alive indefinitely + // Will be killed by chrome_cleanup via the PID file + setInterval(() => {}, 1000); + } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - // Print results - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } - console.log(`STATUS=${status}`); - - if (error) { + const error = `${e.name}: ${e.message}`; console.error(`ERROR=${error}`); + + const endTs = new Date(); + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'failed', + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + error, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(1); } - - // Print JSON result - const resultJson = { - extractor: EXTRACTOR_NAME, - url, - snapshot_id: snapshotId, - status, - start_ts: startTs.toISOString(), - end_ts: endTs.toISOString(), - duration: Math.round(duration * 100) / 100, - output, - saved_count: savedCount, - error: error || null, - }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); - - process.exit(status === 'succeeded' ? 0 : 1); } main().catch(e => { diff --git a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js similarity index 99% rename from archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js rename to archivebox/plugins/singlefile/on_Crawl__04_singlefile.js index 81d23435..cb17a9a3 100755 --- a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js +++ b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js @@ -7,8 +7,8 @@ * * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle * - * Priority: 04 (early) - Must install before Chrome session starts - * Hook: on_Snapshot + * Priority: 04 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: * - Saves complete web pages as single HTML files diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js index 2ce4cd65..4d18e010 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js @@ -1,18 +1,12 @@ #!/usr/bin/env node /** - * Extract SSL/TLS certificate details from a URL. + * Extract SSL/TLS certificate details from a URL (DAEMON MODE). * - * Connects to Chrome session and retrieves security details including: - * - Protocol (TLS 1.2, TLS 1.3, etc.) - * - Cipher suite - * - Certificate issuer, validity period - * - Security state + * This hook daemonizes and stays alive to capture SSL details throughout + * the snapshot lifecycle. It's killed by chrome_cleanup at the end. 
* - * Usage: on_Snapshot__16_ssl.js --url= --snapshot-id= - * Output: Writes ssl/ssl.json - * - * Environment variables: - * SAVE_SSL: Enable SSL extraction (default: true) + * Usage: on_Snapshot__23_ssl.js --url= --snapshot-id= + * Output: Writes ssl.json + listener.pid */ const fs = require('fs'); @@ -23,6 +17,7 @@ const puppeteer = require('puppeteer-core'); const EXTRACTOR_NAME = 'ssl'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.json'; +const PID_FILE = 'listener.pid'; const CHROME_SESSION_DIR = '../chrome_session'; // Parse command line arguments @@ -58,103 +53,103 @@ function getCdpUrl() { return null; } -// Extract SSL details -async function extractSsl(url) { - // Output directory is current directory (hook already runs in output dir) +function getPageId() { + const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt'); + if (fs.existsSync(pageIdFile)) { + return fs.readFileSync(pageIdFile, 'utf8').trim(); + } + return null; +} + +// Set up SSL listener +async function setupListener(url) { const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); // Only extract SSL for HTTPS URLs if (!url.startsWith('https://')) { - return { success: false, error: 'URL is not HTTPS' }; + throw new Error('URL is not HTTPS'); } - let browser = null; - let sslInfo = {}; + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + throw new Error('No Chrome session found'); + } - try { - // Connect to existing Chrome session - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' }; - } + const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, + // Find our page + const pages = await browser.pages(); + const pageId = getPageId(); + let page = null; + + if (pageId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === pageId; }); + } + if (!page) { + page = pages[pages.length - 1]; + } - // Get the page - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; + if (!page) { + throw new Error('No page found'); + } - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } + // Set up listener to capture SSL details when chrome_navigate loads the page + page.on('response', async (response) => { + try { + const request = response.request(); - // Get CDP client for low-level access - const client = await page.target().createCDPSession(); - - // Enable Security domain - await client.send('Security.enable'); - - // Get security details from the loaded page - const securityState = await client.send('Security.getSecurityState'); - - sslInfo = { - url, - securityState: securityState.securityState, - schemeIsCryptographic: securityState.schemeIsCryptographic, - summary: securityState.summary || '', - }; - - // Try to get detailed certificate info if available - if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) { - sslInfo.issues = securityState.securityStateIssueIds; - } - - // Get response security details from navigation - let mainResponse = null; - page.on('response', async (response) => { - if (response.url() === url || response.request().isNavigationRequest()) { - mainResponse = response; + // Only capture the main navigation request + if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) { + return; } - }); - // If we have security details from 
response - if (mainResponse) { - try { - const securityDetails = await mainResponse.securityDetails(); - if (securityDetails) { - sslInfo.protocol = securityDetails.protocol(); - sslInfo.subjectName = securityDetails.subjectName(); - sslInfo.issuer = securityDetails.issuer(); - sslInfo.validFrom = securityDetails.validFrom(); - sslInfo.validTo = securityDetails.validTo(); - sslInfo.certificateId = securityDetails.subjectName(); + // Only capture if it's for our target URL + if (!response.url().startsWith(url.split('?')[0])) { + return; + } - const sanList = securityDetails.sanList(); - if (sanList && sanList.length > 0) { - sslInfo.subjectAlternativeNames = sanList; - } + // Get security details from the response + const securityDetails = response.securityDetails(); + let sslInfo = {}; + + if (securityDetails) { + sslInfo.protocol = securityDetails.protocol(); + sslInfo.subjectName = securityDetails.subjectName(); + sslInfo.issuer = securityDetails.issuer(); + sslInfo.validFrom = securityDetails.validFrom(); + sslInfo.validTo = securityDetails.validTo(); + sslInfo.certificateId = securityDetails.subjectName(); + sslInfo.securityState = 'secure'; + sslInfo.schemeIsCryptographic = true; + + const sanList = securityDetails.sanList(); + if (sanList && sanList.length > 0) { + sslInfo.subjectAlternativeNames = sanList; } - } catch (e) { - // Security details not available + } else if (response.url().startsWith('https://')) { + // HTTPS URL but no security details means something went wrong + sslInfo.securityState = 'unknown'; + sslInfo.schemeIsCryptographic = true; + sslInfo.error = 'No security details available'; + } else { + // Non-HTTPS URL + sslInfo.securityState = 'insecure'; + sslInfo.schemeIsCryptographic = false; } + + // Write output directly to file + fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2)); + + } catch (e) { + // Ignore errors } + }); - await client.detach(); - - // Write output - fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2)); - - return { success: true, output: outputPath, sslInfo }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } - } + // Don't disconnect - keep browser connection alive + return { browser, page }; } async function main() { @@ -163,75 +158,83 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__16_ssl.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__23_ssl.js --url= --snapshot-id='); process.exit(1); } + if (!getEnvBool('SAVE_SSL', true)) { + console.log('Skipping (SAVE_SSL=False)'); + const result = { + extractor: EXTRACTOR_NAME, + status: 'skipped', + url, + snapshot_id: snapshotId, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(0); + } + const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; try { - // Check if enabled - if (!getEnvBool('SAVE_SSL', true)) { - console.log('Skipping SSL (SAVE_SSL=False)'); - status = 'skipped'; - const endTs = new Date(); - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`STATUS=${status}`); - console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`); - process.exit(0); - } + // Set up listener + await setupListener(url); - const result = await extractSsl(url); + // Write PID file so chrome_cleanup can kill us + 
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); - if (result.success) { - status = 'succeeded'; - output = result.output; - const protocol = result.sslInfo?.protocol || 'unknown'; - console.log(`SSL details extracted: ${protocol}`); - } else { - status = 'failed'; - error = result.error; + // Report success immediately (we're staying alive in background) + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + console.log(`START_TS=${startTs.toISOString()}`); + console.log(`END_TS=${endTs.toISOString()}`); + console.log(`DURATION=${duration.toFixed(2)}`); + console.log(`OUTPUT=${OUTPUT_FILE}`); + console.log(`STATUS=succeeded`); + + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'succeeded', + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + duration: Math.round(duration * 100) / 100, + output: OUTPUT_FILE, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + + // Daemonize: detach from parent and keep running + // This process will be killed by chrome_cleanup + if (process.stdin.isTTY) { + process.stdin.pause(); } + process.stdin.unref(); + process.stdout.end(); + process.stderr.end(); + + // Keep the process alive indefinitely + // Will be killed by chrome_cleanup via the PID file + setInterval(() => {}, 1000); + } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - // Print results - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } - console.log(`STATUS=${status}`); - - if (error) { + const error = `${e.name}: ${e.message}`; console.error(`ERROR=${error}`); + + const endTs = new Date(); + const result = { + extractor: EXTRACTOR_NAME, + url, + snapshot_id: snapshotId, + status: 'failed', + start_ts: startTs.toISOString(), + end_ts: endTs.toISOString(), + error, + }; + console.log(`RESULT_JSON=${JSON.stringify(result)}`); + process.exit(1); } - - // Print JSON result - const resultJson = { - extractor: EXTRACTOR_NAME, - url, - snapshot_id: snapshotId, - status, - start_ts: startTs.toISOString(), - end_ts: endTs.toISOString(), - duration: Math.round(duration * 100) / 100, - output, - error: error || null, - }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); - - process.exit(status === 'succeeded' ? 
0 : 1); } main().catch(e => { diff --git a/archivebox/plugins/ublock/on_Snapshot__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__03_ublock.js similarity index 97% rename from archivebox/plugins/ublock/on_Snapshot__03_ublock.js rename to archivebox/plugins/ublock/on_Crawl__03_ublock.js index 190a24e6..cf0f8240 100755 --- a/archivebox/plugins/ublock/on_Snapshot__03_ublock.js +++ b/archivebox/plugins/ublock/on_Crawl__03_ublock.js @@ -7,8 +7,8 @@ * * Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm * - * Priority: 03 (early) - Must install before Chrome session starts - * Hook: on_Snapshot + * Priority: 03 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: * - Blocks ads, trackers, and malware domains diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 8d3f1e90..bbcb0a3b 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -751,13 +751,15 @@ } .messagelist li.warning { - background: #fffbeb; + background: #fffbeb !important; + background-image: none !important; border: 1px solid #fde68a; color: #92400e; } .messagelist li.error { - background: #fef2f2; + background: #fef2f2 !important; + background-image: none !important; border: 1px solid #fecaca; color: #991b1b; } @@ -916,11 +918,13 @@ gap: 12px; } - #toolbar form { + #toolbar form, + #changelist-search { display: flex; align-items: center; gap: 8px; - flex: 1; + flex: 0 1 auto; + max-width: 500px; } #searchbar { diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index 3b5299af..10286104 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -162,10 +162,15 @@ padding: 10px 14px; background: rgba(0,0,0,0.2); cursor: pointer; + text-decoration: none; + color: inherit; } #progress-monitor .crawl-header:hover { background: rgba(88, 166, 255, 0.1); } + #progress-monitor a.crawl-header:visited { + color: inherit; + } #progress-monitor .crawl-icon { font-size: 16px; width: 20px; @@ -252,10 +257,15 @@ gap: 10px; padding: 8px 12px; cursor: pointer; + text-decoration: none; + color: inherit; } #progress-monitor .snapshot-header:hover { background: rgba(88, 166, 255, 0.05); } + #progress-monitor a.snapshot-header:visited { + color: inherit; + } #progress-monitor .snapshot-icon { font-size: 14px; width: 18px; @@ -391,15 +401,6 @@ color: #f85149; } - /* Expand/Collapse Icons */ - #progress-monitor .expand-icon { - color: #8b949e; - font-size: 10px; - transition: transform 0.2s; - } - #progress-monitor .expand-icon.expanded { - transform: rotate(90deg); - }
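Both daemonized hooks above (ssl and responses) write a listener.pid file and rely on a chrome_cleanup step, which their comments reference but which is not included in this diff, to terminate them at the end of the snapshot. Below is a minimal sketch of that cleanup side, assuming only the listener.pid convention shown above; the directory names and the existence of a chrome_cleanup hook script are assumptions, not code from this changeset.

#!/usr/bin/env node
// Sketch only: kill any daemonized listeners left behind by sibling extractors.
// Assumes each extractor ran in its own sibling directory next to chrome_session/.
const fs = require('fs');
const path = require('path');

const LISTENER_DIRS = ['../ssl', '../responses'];   // assumed layout

for (const dir of LISTENER_DIRS) {
  const pidFile = path.join(dir, 'listener.pid');
  if (!fs.existsSync(pidFile)) continue;

  const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
  if (!Number.isInteger(pid)) continue;

  try {
    process.kill(pid, 'SIGTERM');   // ask the listener to exit
    console.log(`Stopped listener ${pid} from ${pidFile}`);
  } catch (e) {
    // ESRCH means the process already exited; anything else is non-fatal here
    console.error(`Could not stop listener ${pid}: ${e.message}`);
  }

  try { fs.unlinkSync(pidFile); } catch (e) { /* ignore stale pid file */ }
}

SIGTERM is sufficient for the listeners shown in this diff, since they only idle in an empty setInterval and write their index/output files incrementally as responses arrive.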
@@ -449,8 +450,6 @@ let pollInterval = null; let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true'; - let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]')); - let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]')); // Baselines for resettable counters let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0'); @@ -496,9 +495,8 @@ } function renderSnapshot(snapshot, crawlId) { - const snapshotKey = `${crawlId}-${snapshot.id}`; - const isExpanded = expandedSnapshots.has(snapshotKey); const statusIcon = snapshot.status === 'started' ? '↻' : '📄'; + const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`; let extractorHtml = ''; if (snapshot.all_extractors && snapshot.all_extractors.length > 0) { @@ -507,16 +505,15 @@ a.extractor.localeCompare(b.extractor) ); extractorHtml = ` -
[template-literal hunks garbled during extraction: the HTML markup returned by renderSnapshot() and by the crawl rendering function was stripped, leaving only the interpolations ${sortedExtractors.map(e => renderExtractor(e)).join('')}, ${warningHtml}, and ${snapshotsHtml}. Judging from the surrounding changes (the new adminUrl constant, the removed expand-icon CSS, the new a.crawl-header / a.snapshot-header styles, and the removal of toggleCrawl/toggleSnapshot below), these hunks appear to replace the clickable expand/collapse wrappers with plain anchor headers linking to the corresponding admin change pages, with the extractor list rendered unconditionally.]
@@ -613,41 +609,6 @@ `; } - window.toggleCrawl = function(crawlId) { - const item = document.querySelector(`[data-crawl-id="${crawlId}"]`); - const body = item.querySelector('.crawl-body'); - const icon = item.querySelector('.expand-icon'); - - if (expandedCrawls.has(crawlId)) { - expandedCrawls.delete(crawlId); - body.style.display = 'none'; - icon.classList.remove('expanded'); - } else { - expandedCrawls.add(crawlId); - body.style.display = ''; - icon.classList.add('expanded'); - } - localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls])); - }; - - window.toggleSnapshot = function(snapshotKey) { - const item = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`); - const extractorList = item.querySelector('.extractor-list'); - const icon = item.querySelector('.expand-icon'); - - if (!extractorList) return; - - if (expandedSnapshots.has(snapshotKey)) { - expandedSnapshots.delete(snapshotKey); - extractorList.style.display = 'none'; - icon.classList.remove('expanded'); - } else { - expandedSnapshots.add(snapshotKey); - extractorList.style.display = ''; - icon.classList.add('expanded'); - } - localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots])); - }; function updateProgress(data) { // Calculate if there's activity diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 4536fa83..e4d2ba97 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -66,9 +66,9 @@ class Orchestrator: """ WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker] - + # Configuration - POLL_INTERVAL: float = 1.0 + POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds) IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit) MAX_WORKERS_PER_TYPE: int = 4 # Max workers per model type MAX_TOTAL_WORKERS: int = 12 # Max workers across all types diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py index 691adae4..8ec749ee 100644 --- a/archivebox/workers/supervisord_util.py +++ b/archivebox/workers/supervisord_util.py @@ -3,6 +3,7 @@ __package__ = 'archivebox.workers' import sys import time import signal +import socket import psutil import shutil import subprocess @@ -47,6 +48,16 @@ SERVER_WORKER = lambda host, port: { "redirect_stderr": "true", } +def is_port_in_use(host: str, port: int) -> bool: + """Check if a port is already in use.""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind((host, port)) + return False + except OSError: + return True + @cache def get_sock_file(): """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits""" @@ -161,9 +172,10 @@ def stop_existing_supervisord_process(): except subprocess.TimeoutExpired: _supervisord_proc.kill() _supervisord_proc.wait(timeout=2) - except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt): + except (BrokenPipeError, IOError): pass - _supervisord_proc = None + finally: + _supervisord_proc = None return # Fallback: if pid file exists, load PID int and kill that process @@ -194,7 +206,7 @@ def stop_existing_supervisord_process(): pass except psutil.NoSuchProcess: pass - except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt): + except (BrokenPipeError, IOError): pass finally: try: @@ -423,15 +435,8 @@ def tail_multiple_worker_logs(log_files: 
list[str], follow=True, proc=None): for log_path in log_paths: try: f = open(log_path, 'r') - # Don't seek to end - show recent content so user sees something - # Go to end minus 4KB to show some recent logs - f.seek(0, 2) # Go to end first - file_size = f.tell() - if file_size > 4096: - f.seek(file_size - 4096) - f.readline() # Skip partial line - else: - f.seek(0) # Small file, read from start + # Seek to end - only show NEW logs from now on, not old logs + f.seek(0, 2) # Go to end file_handles.append((log_path, f)) print(f" [tailing {log_path.name}]") @@ -536,7 +541,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): finally: # Ensure supervisord and all children are stopped stop_existing_supervisord_process() - time.sleep(0.5) + time.sleep(1.0) # Give processes time to fully terminate def start_cli_workers(watch=False): @@ -563,7 +568,7 @@ def start_cli_workers(watch=False): finally: # Ensure supervisord and all children are stopped stop_existing_supervisord_process() - time.sleep(0.5) + time.sleep(1.0) # Give processes time to fully terminate return [ORCHESTRATOR_WORKER] diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 991a0e72..91860fbe 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -67,8 +67,8 @@ class Worker: # Configuration (can be overridden by subclasses) MAX_TICK_TIME: ClassVar[int] = 60 MAX_CONCURRENT_TASKS: ClassVar[int] = 1 - POLL_INTERVAL: ClassVar[float] = 1.0 - IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval) + POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds) + IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval) def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any): self.worker_id = worker_id @@ -214,28 +214,33 @@ class Worker: self.idle_count = 0 # Build metadata for task start - start_metadata = {'task_id': str(obj.pk)} + start_metadata = {} + url = None if hasattr(obj, 'url'): # SnapshotWorker url = str(obj.url) if obj.url else None - else: - url = None + elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'): + # ArchiveResultWorker + url = str(obj.snapshot.url) if obj.snapshot.url else None + elif hasattr(obj, 'get_urls_list'): + # CrawlWorker + urls = obj.get_urls_list() + url = urls[0] if urls else None extractor = None if hasattr(obj, 'extractor'): - # ArchiveResultWorker + # ArchiveResultWorker, Crawl extractor = obj.extractor - start_metadata['extractor'] = extractor log_worker_event( worker_type=worker_type_name, - event='Processing...', + event='Starting...', indent_level=indent_level, pid=self.pid, worker_id=str(self.worker_id), url=url, extractor=extractor, - metadata=start_metadata, + metadata=start_metadata if start_metadata else None, ) start_time = time.time() @@ -244,7 +249,6 @@ class Worker: # Build metadata for task completion complete_metadata = { - 'task_id': str(obj.pk), 'duration': elapsed, 'status': 'success' if success else 'failed', } diff --git a/test_extensions.sh b/test_extensions.sh deleted file mode 100755 index 79efc6b2..00000000 --- a/test_extensions.sh +++ /dev/null @@ -1,101 +0,0 @@ -#!/bin/bash -set -e - -echo "==========================================" -echo "Testing Chrome Extension System" -echo "==========================================" - -# Get absolute path to project root -PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)" - -# Set up test environment with absolute paths -export 
DATA_DIR="$PROJECT_ROOT/data" -export ACTIVE_PERSONA="Test" -export CHROME_EXTENSIONS_DIR="$PROJECT_ROOT/data/personas/Test/chrome_extensions" -export API_KEY_2CAPTCHA="test_api_key_12345" - -# Clean up any previous test data -echo "" -echo "[1/6] Cleaning up previous test data..." -rm -rf "$CHROME_EXTENSIONS_DIR" -rm -rf "$PROJECT_ROOT/chrome_session" -# Also clean up any files created in plugin directories from previous runs -find "$PROJECT_ROOT/archivebox/plugins" -type d -name "data" -exec rm -rf {} + 2>/dev/null || true -mkdir -p "$CHROME_EXTENSIONS_DIR" - -echo "✓ Clean slate ready" - -# Test 1: Install captcha2 extension -echo "" -echo "[2/6] Testing captcha2 extension installation..." -node "$PROJECT_ROOT/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js" -if [ -f "$CHROME_EXTENSIONS_DIR/captcha2.extension.json" ]; then - echo "✓ captcha2.extension.json created" -else - echo "✗ Failed to create captcha2.extension.json" - exit 1 -fi - -# Test 2: Check caching (run again, should skip) -echo "" -echo "[3/6] Testing cache (should skip re-installation)..." -node "$PROJECT_ROOT/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js" -echo "✓ Cache check passed" - -# Test 3: Install other extensions -echo "" -echo "[4/6] Testing other extensions..." -node "$PROJECT_ROOT/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js" -node "$PROJECT_ROOT/archivebox/plugins/ublock/on_Snapshot__03_ublock.js" -node "$PROJECT_ROOT/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js" - -echo "✓ All extensions installed" - -# Test 4: List installed extensions -echo "" -echo "[5/6] Verifying extension files..." -ls -lh "$CHROME_EXTENSIONS_DIR"/*.extension.json 2>/dev/null || echo "No extension.json files found" - -# Count extensions -EXT_COUNT=$(ls -1 "$CHROME_EXTENSIONS_DIR"/*.extension.json 2>/dev/null | wc -l | tr -d ' ') -echo "" -echo "Found $EXT_COUNT extension metadata files" - -if [ "$EXT_COUNT" -ge "3" ]; then - echo "✓ Expected extensions installed" -else - echo "✗ Expected at least 3 extensions, found $EXT_COUNT" - exit 1 -fi - -# Test 5: Check unpacked directories -echo "" -echo "[6/6] Checking unpacked extension directories..." -UNPACKED_COUNT=$(find "$CHROME_EXTENSIONS_DIR" -type d -name "*__*" 2>/dev/null | wc -l | tr -d ' ') -echo "Found $UNPACKED_COUNT unpacked extension directories" - -if [ "$UNPACKED_COUNT" -ge "3" ]; then - echo "✓ Extensions unpacked successfully" -else - echo "✗ Expected at least 3 unpacked directories, found $UNPACKED_COUNT" - exit 1 -fi - -# Summary -echo "" -echo "==========================================" -echo "✓ All tests passed!" -echo "==========================================" -echo "" -echo "Installed extensions:" -for json_file in "$CHROME_EXTENSIONS_DIR"/*.extension.json; do - if [ -f "$json_file" ]; then - NAME=$(node -e "console.log(require('$json_file').name)") - VERSION=$(node -e "console.log(require('$json_file').version || 'unknown')") - echo " - $NAME (v$VERSION)" - fi -done - -echo "" -echo "To clean up test data:" -echo " rm -rf ./data/personas/Test"
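All of the hooks touched in this diff report their outcome through the same line-oriented stdout protocol (START_TS=, END_TS=, DURATION=, OUTPUT=, STATUS=, and a final RESULT_JSON= line). The consumer of that output is not part of this changeset; the sketch below only illustrates how such output could be parsed, and parseHookOutput is a hypothetical helper, not an existing ArchiveBox function.

// Sketch only: turn a hook's stdout into a structured result object.
function parseHookOutput(stdout) {
  const result = {};
  for (const line of stdout.split('\n')) {
    const eq = line.indexOf('=');
    if (eq === -1) continue;
    const key = line.slice(0, eq);
    const value = line.slice(eq + 1);
    if (key === 'RESULT_JSON') {
      try { result.result = JSON.parse(value); } catch (e) { /* leave unset on bad JSON */ }
    } else if (['START_TS', 'END_TS', 'DURATION', 'OUTPUT', 'STATUS'].includes(key)) {
      result[key.toLowerCase()] = value.trim();
    }
  }
  return result;
}

// Example: a skipped hook prints STATUS=skipped plus a RESULT_JSON line, so
// parseHookOutput(stdout).status === 'skipped' and .result.extractor names the plugin.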