Tons of UI fixes and plugin fixes

Nick Sweeting committed 2025-12-25 03:59:51 -08:00
commit 9838d7ba02 (parent bb53228ebf)
36 changed files with 2215 additions and 1491 deletions

View File

@@ -122,12 +122,10 @@ class ModelWithOutputDir(ModelWithSerializers):
class Meta:
abstract = True
def save(self, *args, write_indexes=False, **kwargs):
def save(self, *args, **kwargs):
super().save(*args, **kwargs)
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
self.save_json_index()
if write_indexes:
self.write_indexes()
@property
def output_dir_parent(self) -> str:
@@ -145,17 +143,5 @@ class ModelWithOutputDir(ModelWithSerializers):
def OUTPUT_DIR(self) -> Path:
return DATA_DIR / self.output_dir_str
def write_indexes(self):
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
self.save_merkle_index()
self.save_html_index()
def save_merkle_index(self):
with open(self.OUTPUT_DIR / '.hashes.json', 'w') as f:
json.dump(get_dir_info(self.OUTPUT_DIR, max_depth=6), f)
def save_html_index(self):
(self.OUTPUT_DIR / 'index.html').write_text(str(self))
def save_json_index(self):
(self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))

View File

@@ -72,30 +72,42 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
get_worker,
start_server_workers,
tail_multiple_worker_logs,
is_port_in_use,
)
from workers.orchestrator import Orchestrator
import sys
# Check if port is already in use
if is_port_in_use(host, int(port)):
print(f'[red][X] Error: Port {port} is already in use[/red]')
print(f' Another process (possibly daphne) is already listening on {host}:{port}')
print(f' Stop the conflicting process or choose a different port')
sys.exit(1)
# Check if orchestrator is already running for this data directory
if Orchestrator.is_running():
print(f'[red][X] Error: ArchiveBox orchestrator is already running for this data directory[/red]')
print(f' Stop the existing orchestrator before starting a new server')
print(f' To stop: pkill -f "archivebox manage orchestrator"')
sys.exit(1)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
# If daphne is already running, just tail logs
# If daphne is already running, error out
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print('[red][X] Error: ArchiveBox server is already running[/red]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()
# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
print('[yellow]To stop the existing server, run:[/yellow]')
print(' pkill -f "archivebox server"')
print(' pkill -f supervisord')
sys.exit(1)
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
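For context, is_port_in_use() is imported from the worker helpers above but its implementation is not part of this diff. A minimal sketch of what such a check usually looks like, assuming a plain TCP connect test (illustrative only, not the shipped helper):

import socket

def is_port_in_use(host: str, port: int) -> bool:
    """Return True if something is already accepting connections on host:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex((host, port)) == 0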

View File

@@ -91,31 +91,43 @@ def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
"""Discover plugins from filesystem directories."""
import json
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR
plugins = {}
for base_dir, source in [(BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')]:
if not base_dir.exists():
continue
for plugin_dir in base_dir.iterdir():
if plugin_dir.is_dir() and not plugin_dir.name.startswith('_'):
plugin_id = f'{source}.{plugin_dir.name}'
# Find hook scripts
hooks = []
for ext in ('sh', 'py', 'js'):
hooks.extend(plugin_dir.glob(f'on_*__*.{ext}'))
# Load config.json if it exists
config_file = plugin_dir / 'config.json'
config_data = None
if config_file.exists():
try:
with open(config_file, 'r') as f:
config_data = json.load(f)
except (json.JSONDecodeError, IOError):
config_data = None
plugins[plugin_id] = {
'id': plugin_id,
'name': plugin_dir.name,
'path': str(plugin_dir),
'source': source,
'hooks': [str(h.name) for h in hooks],
'config': config_data,
}
return plugins
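As a concrete illustration of what this discovery loop picks up, a user plugin might be laid out as below; the directory name, hook filename, and config key are invented for the example and are not part of this commit:

# data/plugins/my_plugin/
#     on_Snapshot__50_my_hook.py
#     config.json        # optional, JSON-Schema-style: {"properties": {...}}
#
# get_filesystem_plugins() would then yield an entry roughly like:
example_entry = {
    'user.my_plugin': {
        'id': 'user.my_plugin',
        'name': 'my_plugin',
        'path': '/data/plugins/my_plugin',
        'source': 'user',
        'hooks': ['on_Snapshot__50_my_hook.py'],
        'config': {'properties': {'MY_PLUGIN_TIMEOUT': {'type': 'integer', 'default': 60}}},
    },
}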
@@ -242,6 +254,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Source": [],
"Path": [],
"Hooks": [],
"Config": [],
}
plugins = get_filesystem_plugins()
@@ -252,12 +265,21 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
# Show config status
if plugin.get('config'):
config_properties = plugin['config'].get('properties', {})
config_count = len(config_properties)
rows['Config'].append(f'{config_count} properties' if config_count > 0 else '✅ present')
else:
rows['Config'].append('❌ none')
if not plugins:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
rows['Hooks'].append('-')
rows['Config'].append('-')
return TableContext(
title="Installed plugins",
@@ -266,11 +288,12 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import json
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
plugins = get_filesystem_plugins()
plugin = plugins.get(key)
if not plugin:
return ItemContext(
@@ -279,6 +302,33 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
data=[],
)
# Base fields that all plugins have
fields = {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
"path": plugin['path'],
"hooks": plugin['hooks'],
}
# Add config.json data if available
if plugin.get('config'):
config_json = json.dumps(plugin['config'], indent=2)
fields["config.json"] = mark_safe(f'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>')
# Also extract and display individual config properties for easier viewing
if 'properties' in plugin['config']:
config_properties = plugin['config']['properties']
properties_summary = []
for prop_name, prop_info in config_properties.items():
prop_type = prop_info.get('type', 'unknown')
prop_default = prop_info.get('default', 'N/A')
prop_desc = prop_info.get('description', '')
properties_summary.append(f"{prop_name} ({prop_type}): {prop_desc}")
if properties_summary:
fields["Config Properties"] = mark_safe('<br/>'.join(properties_summary))
return ItemContext(
slug=key,
title=plugin['name'],
@@ -286,13 +336,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
{
"name": plugin['name'],
"description": plugin['path'],
"fields": {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
"path": plugin['path'],
"hooks": plugin['hooks'],
},
"fields": fields,
"help_texts": {},
},
],

View File

@@ -22,7 +22,7 @@ from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, extractor, output, and actions."""
results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
results = list(archiveresults_qs.order_by('extractor').select_related('snapshot')[:limit])
if not results:
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
@@ -239,7 +239,7 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -249,7 +249,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at', 'iface'),
'classes': ('card',),
}),
('Timing', {

View File

@@ -12,7 +12,7 @@ class ArchiveBoxAdmin(admin.AdminSite):
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.disable_action('delete_selected')
# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

View File

@@ -23,7 +23,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.models import Tag, Snapshot
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
@@ -262,6 +262,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
for tag in obj.tags.all()
if str(tag.name).strip()
)
# Show title if available, otherwise show URL
display_text = obj.title or obj.url
css_class = 'fetched' if obj.title else 'pending'
return format_html(
'<a href="/{}">'
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
@@ -272,8 +276,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.title else 'pending',
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
css_class,
urldecode(htmldecode(display_text))[:128]
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
@@ -402,12 +406,21 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
from archivebox.cli.archivebox_remove import remove
remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
"""Delete snapshots in a single transaction to avoid SQLite concurrency issues."""
from django.db import transaction
total = queryset.count()
# Get list of IDs to delete first (outside transaction)
ids_to_delete = list(queryset.values_list('pk', flat=True))
# Delete everything in a single atomic transaction
with transaction.atomic():
deleted_count, _ = Snapshot.objects.filter(pk__in=ids_to_delete).delete()
messages.success(
request,
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
mark_safe(f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
)

View File

@@ -295,7 +295,7 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -362,9 +362,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
},
)
@property
def output_dir_parent(self) -> str:
return 'archive'
@property
def output_dir_name(self) -> str:
return str(self.timestamp)
@@ -808,7 +810,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# UUID field is added separately by migration for new records
id = models.AutoField(primary_key=True, editable=False)
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set', db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -844,7 +846,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
# Call the Django Model.save() directly instead
models.Model.save(self, *args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
@@ -916,9 +921,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_dir_parent(self) -> str:
return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
def write_indexes(self):
super().write_indexes()
def save_search_index(self):
pass
@@ -966,6 +968,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
)
end_ts = timezone.now()
# Clean up empty output directory if no files were created
output_files = result.get('output_files', [])
if not output_files and extractor_dir.exists():
try:
# Only remove if directory is completely empty
if not any(extractor_dir.iterdir()):
extractor_dir.rmdir()
except (OSError, RuntimeError):
pass # Directory not empty or can't be removed, that's fine
# Determine status from return code and JSON output
output_json = result.get('output_json') or {}
json_status = output_json.get('status')
@@ -990,15 +1002,46 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.start_ts = start_ts
self.end_ts = end_ts
self.retry_at = None
self.pwd = str(extractor_dir)
# Save cmd and cmd_version from extractor output
if output_json.get('cmd_version'):
self.cmd_version = output_json['cmd_version'][:128] # Max length from model
if output_json.get('cmd'):
self.cmd = output_json['cmd']
self.save()
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
self._queue_urls_for_crawl(extractor_dir)
# Update snapshot title if this is the title extractor
# Check both old numeric name and new plugin name for compatibility
extractor_name = get_extractor_name(self.extractor)
if self.status == self.StatusChoices.SUCCEEDED and extractor_name == 'title':
self._update_snapshot_title(extractor_dir)
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
def _update_snapshot_title(self, extractor_dir: Path):
"""
Update snapshot title from title extractor output.
The title extractor writes title.txt with the extracted page title.
This updates the Snapshot.title field if the file exists and has content.
"""
title_file = extractor_dir / 'title.txt'
if title_file.exists():
try:
title = title_file.read_text(encoding='utf-8').strip()
if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
self.snapshot.title = title[:512] # Max length from model
self.snapshot.save(update_fields=['title', 'modified_at'])
except Exception:
pass # Failed to read title, that's okay
def _queue_urls_for_crawl(self, extractor_dir: Path):
"""
Read urls.jsonl and queue discovered URLs for crawling.
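The body of _queue_urls_for_crawl() is truncated by this hunk. Purely as an illustration of reading a urls.jsonl file line by line (the record fields shown are assumptions, not taken from this commit):

import json
from pathlib import Path

def read_urls_jsonl(extractor_dir: Path) -> list[dict]:
    """Parse the urls.jsonl written by parser extractors, skipping malformed lines."""
    urls_file = extractor_dir / 'urls.jsonl'
    if not urls_file.exists():
        return []
    records = []
    for line in urls_file.read_text(encoding='utf-8').splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))  # e.g. {"url": "...", "depth": 1}
        except json.JSONDecodeError:
            continue
    return records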

View File

@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
# unlock the snapshot after we're done + set status = started
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
@@ -209,12 +209,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
from machine.models import NetworkInterface
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
iface=NetworkInterface.current(),
)
# Run the extractor - this updates status, output, timestamps, etc.
@@ -234,7 +237,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=None,
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
self.archiveresult.save()
@succeeded.enter
def enter_succeeded(self):
@@ -245,7 +248,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
self.archiveresult.save(write_indexes=True)
self.archiveresult.save()
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

View File

@@ -560,16 +560,28 @@ def live_progress_view(request):
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
# Build hierarchical active crawls with nested snapshots and archive results
active_crawls = []
for crawl in Crawl.objects.filter(
from django.db.models import Prefetch
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).order_by('-modified_at')[:10]:
# Get snapshots for this crawl
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
).prefetch_related(
'snapshot_set',
'snapshot_set__archiveresult_set',
).distinct().order_by('-modified_at')[:10]
active_crawls = []
for crawl in active_crawls_qs:
# Get active snapshots for this crawl - filter in Python since we prefetched all
crawl_snapshots = [
s for s in crawl.snapshot_set.all()
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5] # Limit to 5 most recent
# Count snapshots by status (in memory, not DB)
total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB
completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -579,39 +591,39 @@ def live_progress_view(request):
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
# Get active snapshots for this crawl
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots.filter(
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).order_by('-modified_at')[:5]:
# Get archive results for this snapshot
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
total_extractors = snapshot_results.count()
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
for snapshot in crawl_snapshots:
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
# Count in memory instead of DB queries
total_extractors = len(snapshot_results)
completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get all extractors for this snapshot
# Get all extractors for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
def extractor_sort_key(ar):
status_order = {
ArchiveResult.StatusChoices.STARTED: 0,
ArchiveResult.StatusChoices.QUEUED: 1,
ArchiveResult.StatusChoices.SUCCEEDED: 2,
ArchiveResult.StatusChoices.FAILED: 3,
}
return (status_order.get(ar.status, 4), ar.extractor)
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
}
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
for ar in sorted(snapshot_results, key=extractor_sort_key)
]
active_snapshots_for_crawl.append({
@@ -726,15 +738,39 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
from typing import get_type_hints, ClassVar
CONFIGS = get_all_configs()
for config in CONFIGS.values():
if hasattr(config, key):
type_hints = get_type_hints(config)
# Try to get from pydantic model_fields first (more reliable)
if hasattr(config, 'model_fields') and key in config.model_fields:
field = config.model_fields[key]
if hasattr(field, 'annotation'):
try:
return str(field.annotation.__name__)
except AttributeError:
return str(field.annotation)
# Fallback to get_type_hints with proper namespace
try:
return str(type_hints[key].__name__)
except AttributeError:
return str(type_hints[key])
import typing
namespace = {
'ClassVar': ClassVar,
'Optional': typing.Optional,
'Union': typing.Union,
'List': typing.List,
'Dict': typing.Dict,
'Path': Path,
}
type_hints = get_type_hints(config, globalns=namespace, localns=namespace)
try:
return str(type_hints[key].__name__)
except AttributeError:
return str(type_hints[key])
except Exception:
# If all else fails, return str
pass
return 'str'
def key_is_safe(key: str) -> bool:
@@ -743,17 +779,55 @@ def key_is_safe(key: str) -> bool:
return False
return True
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
import os
from machine.models import Machine
# Check if it's from machine config
try:
machine = Machine.current()
if machine.config and key in machine.config:
return 'Machine'
except Exception:
pass
# Check if it's from environment variable
if key in os.environ:
return 'Environment'
# Check if it's from config file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
return 'Config File'
# Otherwise it's using the default
return 'Default'
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
# Get merged config that includes Machine.config overrides
try:
from machine.models import Machine
machine = Machine.current()
merged_config = get_config()
except Exception as e:
# Fallback if Machine model not available
merged_config = get_config()
machine = None
rows = {
"Section": [],
"Key": [],
"Type": [],
"Value": [],
"Source": [],
"Default": [],
# "Documentation": [],
# "Aliases": [],
@@ -764,7 +838,21 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
# Use merged config value (includes machine overrides)
actual_value = merged_config.get(key, getattr(section, key, None))
rows['Value'].append(mark_safe(f'<code>{actual_value}</code>') if key_is_safe(key) else '******** (redacted)')
# Show where the value comes from
source = find_config_source(key, merged_config)
source_colors = {
'Machine': 'purple',
'Environment': 'blue',
'Config File': 'green',
'Default': 'gray'
}
rows['Source'].append(format_html('<code style="color: {}">{}</code>', source_colors.get(source, 'gray'), source))
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
# rows['Aliases'].append(', '.join(find_config_aliases(key)))
@@ -775,6 +863,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(format_html('<code>{}</code>', getattr(type(CONSTANTS_CONFIG[key]), '__name__', str(CONSTANTS_CONFIG[key]))))
rows['Value'].append(format_html('<code>{}</code>', CONSTANTS_CONFIG[key]) if key_is_safe(key) else '******** (redacted)')
rows['Source'].append(mark_safe('<code style="color: gray">Constant</code>'))
rows['Default'].append(mark_safe(f'<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code"><code style="text-decoration: underline">{find_config_default(key) or "See here..."}</code></a>'))
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
# rows['Aliases'].append('')
@@ -787,11 +876,58 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import os
from machine.models import Machine
from archivebox.config.configset import BaseConfigSet
CONFIGS = get_all_configs()
FLAT_CONFIG = get_flat_config()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
# Get merged config
merged_config = get_config()
# Determine all sources for this config value
sources_info = []
# Default value
default_val = find_config_default(key)
if default_val:
sources_info.append(('Default', default_val, 'gray'))
# Config file value
if CONSTANTS.CONFIG_FILE.exists():
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
sources_info.append(('Config File', file_config[key], 'green'))
# Environment variable
if key in os.environ:
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
# Machine config
machine = None
machine_admin_url = None
try:
machine = Machine.current()
machine_admin_url = f'/admin/machine/machine/{machine.id}/change/'
if machine.config and key in machine.config:
sources_info.append(('Machine', machine.config[key] if key_is_safe(key) else '********', 'purple'))
except Exception:
pass
# Final computed value
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
if not key_is_safe(key):
final_value = '********'
# Build sources display
sources_html = '<br/>'.join([
f'<b style="color: {color}">{source}:</b> <code>{value}</code>'
for source, value, color in sources_info
])
# aliases = USER_CONFIG.get(key, {}).get("aliases", [])
aliases = []
@@ -813,7 +949,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********',
'Value': final_value,
'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
@@ -830,10 +967,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
Default: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
<code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
</a>
<b>Configuration Sources (in priority order):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
@@ -845,6 +980,20 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
}"</code>
</p>
'''),
'Source': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>
{f'<br/><b>💡 Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
'''),
},
},
],
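The priority order spelled out in the help text above (Machine over Environment over Config File over Default) boils down to a first-match lookup. A sketch under the assumption that each source behaves like a plain mapping (not the actual ArchiveBox resolver):

import os

def resolve_config_value(key: str, machine_config: dict, file_config: dict, defaults: dict):
    """Return (source_name, value) from the first source that defines the key."""
    sources = (
        ('Machine', machine_config or {}),
        ('Environment', os.environ),
        ('Config File', file_config or {}),
        ('Default', defaults or {}),
    )
    for source_name, mapping in sources:
        if key in mapping:
            return source_name, mapping[key]
    return 'Default', None

# resolve_config_value('CHROME_BINARY', machine_config, file_config, defaults)
# -> ('Machine', '/usr/bin/chromium')   # example value, if Machine.config overrides it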

View File

@@ -3,6 +3,7 @@ __package__ = 'archivebox.crawls'
import json
from pathlib import Path
from django import forms
from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path
@@ -136,16 +137,32 @@ def render_snapshots_list(snapshots_qs, limit=20):
''')
class CrawlAdminForm(forms.ModelForm):
"""Custom form for Crawl admin to render urls field as textarea."""
class Meta:
model = Crawl
fields = '__all__'
widgets = {
'urls': forms.Textarea(attrs={
'rows': 8,
'style': 'width: 100%; font-family: monospace; font-size: 13px;',
'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
}),
}
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')
readonly_fields = ('created_at', 'modified_at', 'snapshots')
fieldsets = (
('URLs', {
'fields': ('urls_editor',),
'fields': ('urls',),
'classes': ('card', 'wide'),
}),
('Info', {
@@ -177,9 +194,32 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
actions = ["delete_selected"]
actions = ["delete_selected_batched"]
change_actions = ['recrawl']
def get_queryset(self, request):
"""Optimize queries with select_related and annotations."""
qs = super().get_queryset(request)
return qs.select_related('schedule', 'created_by').annotate(
num_snapshots_cached=Count('snapshot_set')
)
@admin.action(description='Delete selected crawls')
def delete_selected_batched(self, request, queryset):
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
from django.db import transaction
total = queryset.count()
# Get list of IDs to delete first (outside transaction)
ids_to_delete = list(queryset.values_list('pk', flat=True))
# Delete everything in a single atomic transaction
with transaction.atomic():
deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete()
messages.success(request, f'Successfully deleted {total} crawls ({deleted_count} total objects including related records).')
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
@@ -214,7 +254,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return redirect('admin:crawls_crawl_change', new_crawl.id)
def num_snapshots(self, obj):
return obj.snapshot_set.count()
# Use cached annotation from get_queryset to avoid N+1
return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all())
@@ -269,7 +310,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
readonly>{escaped_urls}</textarea>
<p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
{line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
{line_count} URL{'s' if line_count != 1 else ''} · Note: URLs displayed here for reference only
</p>
</div>

View File

@@ -88,7 +88,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def __str__(self):
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
return f'[{self.id}] {first_url[:64]}'
# Show last 8 digits of UUID and more of the URL
short_id = str(self.id)[-8:]
return f'[...{short_id}] {first_url[:120]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
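A quick illustration of the new __str__ format introduced above (UUID and URL are invented):

import uuid

crawl_id = uuid.UUID('018f3a2b-7c4d-7e5f-8a6b-9c0d1a2b3c4d')
short_id = str(crawl_id)[-8:]     # '1a2b3c4d'
print(f'[...{short_id}] https://example.com/blog/some/long/post-title')
# -> [...1a2b3c4d] https://example.com/blog/some/long/post-title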

View File

@@ -83,7 +83,7 @@ class CrawlMachine(StateMachine, strict_states=True):
# Suppressed: state transition logs
# lock the crawl object while we create snapshots
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
retry_at=timezone.now(), # Process immediately
status=Crawl.StatusChoices.QUEUED,
)
@@ -96,7 +96,7 @@ class CrawlMachine(StateMachine, strict_states=True):
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
retry_at=timezone.now(), # Process immediately
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
@@ -129,7 +129,7 @@ class CrawlMachine(StateMachine, strict_states=True):
timeout=60,
config_objects=[self.crawl],
crawl_id=str(self.crawl.id),
seed_uri=first_url,
source_url=first_url,
)
# Process hook results - parse JSONL output and create DB objects
@@ -195,8 +195,43 @@ class CrawlMachine(StateMachine, strict_states=True):
@sealed.enter
def enter_sealed(self):
# Run on_CrawlEnd hooks to clean up resources (e.g., kill shared Chrome)
self._run_crawl_end_hooks()
# Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,
)
def _run_crawl_end_hooks(self):
"""Run on_CrawlEnd hooks to clean up resources at crawl completion."""
from pathlib import Path
from archivebox.hooks import run_hooks, discover_hooks
from archivebox.config import CONSTANTS
# Discover and run all on_CrawlEnd hooks
hooks = discover_hooks('CrawlEnd')
if not hooks:
return
# Use the same temporary output directory from crawl start
output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
# Run all on_CrawlEnd hooks
first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
results = run_hooks(
event_name='CrawlEnd',
output_dir=output_dir,
timeout=30, # Cleanup hooks should be quick
config_objects=[self.crawl],
crawl_id=str(self.crawl.id),
source_url=first_url,
)
# Log any failures but don't block sealing
for result in results:
if result['returncode'] != 0:
print(f'[yellow]⚠️ CrawlEnd hook failed: {result.get("hook", "unknown")}[/yellow]')
if result.get('stderr'):
print(f'[dim]{result["stderr"][:200]}[/dim]')

View File

@@ -199,16 +199,24 @@ def run_hook(
# Build CLI arguments from kwargs
for key, value in kwargs.items():
# Skip keys that start with underscore (internal parameters)
if key.startswith('_'):
continue
arg_key = f'--{key.replace("_", "-")}'
if isinstance(value, bool):
if value:
cmd.append(arg_key)
elif value is not None:
elif value is not None and value != '':
# JSON-encode complex values, use str for simple ones
# Skip empty strings to avoid --key= which breaks argument parsers
if isinstance(value, (dict, list)):
cmd.append(f'{arg_key}={json.dumps(value)}')
else:
cmd.append(f'{arg_key}={value}')
# Ensure value is converted to string and strip whitespace
str_value = str(value).strip()
if str_value: # Only add if non-empty after stripping
cmd.append(f'{arg_key}={str_value}')
# Set up environment with base paths
env = os.environ.copy()
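To make the flag-building rules above concrete, a standalone restatement of the same mapping with an example call (illustrative only, not the shipped run_hook helper):

import json

def build_hook_args(**kwargs) -> list[str]:
    """Mirror of the CLI-arg rules above: skip _private keys, bools become bare
    flags, dicts/lists are JSON-encoded, empty values are dropped."""
    cmd = []
    for key, value in kwargs.items():
        if key.startswith('_'):
            continue
        arg_key = f'--{key.replace("_", "-")}'
        if isinstance(value, bool):
            if value:
                cmd.append(arg_key)
        elif value is not None and value != '':
            if isinstance(value, (dict, list)):
                cmd.append(f'{arg_key}={json.dumps(value)}')
            else:
                str_value = str(value).strip()
                if str_value:
                    cmd.append(f'{arg_key}={str_value}')
    return cmd

# build_hook_args(crawl_id='abc123', overwrite=True, dry_run=False, label='', tags=['a', 'b'])
# -> ['--crawl-id=abc123', '--overwrite', '--tags=["a", "b"]']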

View File

@@ -125,24 +125,64 @@ def execute_click_command(cmd_name: str, click_command: click.Command, arguments
Returns MCP-formatted result with captured output and error status.
"""
# Setup Django for archive commands (commands that need database access)
from archivebox.cli import ArchiveBoxGroup
if cmd_name in ArchiveBoxGroup.archive_commands:
try:
from archivebox.config.django import setup_django
from archivebox.misc.checks import check_data_folder
setup_django()
check_data_folder()
except Exception as e:
# If Django setup fails, return error (unless it's manage/shell which handle this themselves)
if cmd_name not in ('manage', 'shell'):
return {
"content": [{
"type": "text",
"text": f"Error setting up Django: {str(e)}\n\nMake sure you're running the MCP server from inside an ArchiveBox data directory."
}],
"isError": True
}
# Use Click's test runner to invoke command programmatically
runner = CliRunner()
# Build a map of parameter names to their Click types (Argument vs Option)
param_map = {param.name: param for param in click_command.params}
# Convert arguments dict to CLI args list
args = []
positional_args = []
for key, value in arguments.items():
param_name = key.replace('_', '-') # Click uses dashes
param = param_map.get(key)
if isinstance(value, bool):
if value:
# Check if this is a positional Argument (not an Option)
is_argument = isinstance(param, click.Argument)
if is_argument:
# Positional arguments - add them without dashes
if isinstance(value, list):
positional_args.extend([str(v) for v in value])
elif value is not None:
positional_args.append(str(value))
else:
# Options - add with dashes
if isinstance(value, bool):
if value:
args.append(f'--{param_name}')
elif isinstance(value, list):
# Multiple values for an option (rare)
for item in value:
args.append(f'--{param_name}')
args.append(str(item))
elif value is not None:
args.append(f'--{param_name}')
elif isinstance(value, list):
# Multiple values (e.g., multiple URLs)
for item in value:
args.append(str(item))
elif value is not None:
args.append(f'--{param_name}')
args.append(str(value))
args.append(str(value))
# Add positional arguments at the end
args.extend(positional_args)
# Execute the command
try:
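As a quick illustration of the Argument-vs-Option split above, a self-contained demo using Click's CliRunner; the demo command and its flags are hypothetical, not an actual ArchiveBox command:

import click
from click.testing import CliRunner

@click.command()
@click.argument('urls', nargs=-1)              # positional Argument
@click.option('--depth', type=int, default=0)
@click.option('--overwrite', is_flag=True)
def demo(urls, depth, overwrite):
    click.echo(f'urls={list(urls)} depth={depth} overwrite={overwrite}')

# arguments = {'urls': ['https://example.com'], 'depth': 1, 'overwrite': True}
# flattens (options first, positional values appended last) to:
args = ['--depth', '1', '--overwrite', 'https://example.com']
print(CliRunner().invoke(demo, args).output)
# -> urls=['https://example.com'] depth=1 overwrite=True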

View File

@@ -542,24 +542,32 @@ def log_worker_event(
"""
indent = ' ' * indent_level
# Build worker identifier
from rich.markup import escape
# Build worker identifier (without URL/extractor)
worker_parts = [worker_type]
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
if url and worker_type in ('SnapshotWorker', 'DB'):
worker_parts.append(f'url={truncate_url(url)}')
if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
worker_parts.append(f'extractor={extractor}')
# Format worker label - only add brackets if there are additional identifiers
# Use double brackets [[...]] to escape Rich markup
if len(worker_parts) > 1:
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
worker_label = f'{worker_parts[0]}[[{", ".join(worker_parts[1:])}]]'
else:
worker_label = worker_parts[0]
# Build URL/extractor display (shown AFTER the label, outside brackets)
url_extractor_parts = []
if url:
url_extractor_parts.append(f'url: {escape(url)}')
if extractor:
url_extractor_parts.append(f'extractor: {escape(extractor)}')
url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
# Build metadata string
metadata_str = ''
if metadata:
@@ -592,8 +600,6 @@ def log_worker_event(
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
elif event in ('Processing...', 'PROCESSING'):
color = 'blue'
elif event in ('Completed', 'COMPLETED', 'All work complete'):
color = 'blue'
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
@@ -610,6 +616,12 @@ def log_worker_event(
text = Text()
text.append(indent)
text.append(f'{worker_label} {event}{error_str}', style=color)
# Add URL/extractor info first (more important)
if url_extractor_str:
text.append(f' | {url_extractor_str}')
# Then add other metadata
if metadata_str:
text.append(f' | {metadata_str}')
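Putting the pieces together, a worker log line now renders roughly like this (values invented; Rich colors not shown):

# ArchiveResultWorker[pid=1234] Completed | url: https://example.com/page | extractor: title | <metadata>
# The doubled [[...]] in worker_label above is what lets the literal [pid=1234] brackets
# survive Rich markup handling instead of being parsed as a style tag.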

View File

@@ -8,8 +8,8 @@
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
* Priority: 01 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set

View File

@@ -2,11 +2,11 @@
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Chrome session starts.
* Runs once per browser session to inject API key into extension storage.
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
*
* Priority: 21 (after chrome_session at 20, before navigation at 30)
* Hook: on_Snapshot
* Priority: 11 (after chrome_session at 10)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
@@ -17,8 +17,19 @@ const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-core');
const OUTPUT_DIR = 'chrome_session';
const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured');
// Get crawl ID from args to find the crawl-level chrome session
function getCrawlChromeSessionDir() {
const args = parseArgs();
const crawlId = args.crawl_id;
if (!crawlId) {
return null;
}
const dataDir = process.env.DATA_DIR || '.';
return path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session');
}
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome_session';
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
// Get environment variable with default
function getEnv(name, defaultValue = '') {
@@ -53,7 +64,7 @@ async function configure2Captcha() {
}
// Load extensions metadata
const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome_session must run first' };
}
@@ -70,7 +81,7 @@ async function configure2Captcha() {
try {
// Connect to the existing Chrome session via CDP
const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
return { success: false, error: 'CDP URL not found - chrome_session must run first' };
}

View File

@@ -3,10 +3,11 @@
Clean up Chrome browser session started by chrome_session extractor.
This extractor runs after all Chrome-based extractors (screenshot, pdf, dom)
to terminate the Chrome process and clean up any leftover files.
to clean up the Chrome session. For shared sessions (crawl-level Chrome), it
closes only this snapshot's tab. For standalone sessions, it kills Chrome.
Usage: on_Snapshot__24_chrome_cleanup.py --url=<url> --snapshot-id=<uuid>
Output: Terminates Chrome process and removes lock files
Usage: on_Snapshot__45_chrome_cleanup.py --url=<url> --snapshot-id=<uuid>
Output: Closes tab or terminates Chrome process
Environment variables:
CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup)
@@ -18,6 +19,7 @@ import os
import signal
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
@@ -33,18 +35,126 @@ def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def close_tab_via_cdp(cdp_url: str, page_id: str) -> bool:
"""
Close a specific tab via Chrome DevTools Protocol.
Returns True if tab was closed successfully.
"""
try:
# Extract port from WebSocket URL (ws://127.0.0.1:PORT/...)
import re
match = re.search(r':(\d+)/', cdp_url)
if not match:
return False
port = match.group(1)
# Use CDP HTTP endpoint to close the target
close_url = f'http://127.0.0.1:{port}/json/close/{page_id}'
req = urllib.request.Request(close_url, method='GET')
with urllib.request.urlopen(req, timeout=5) as resp:
return resp.status == 200
except Exception as e:
print(f'Failed to close tab via CDP: {e}', file=sys.stderr)
return False
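For reference, /json/close is part of Chrome's DevTools HTTP interface alongside /json/list; a quick way to poke at the same endpoints by hand (port 9222 is a placeholder for whatever remote debugging port the session uses):

import json
import urllib.request

# Enumerate open targets; an id from this list can then be passed to /json/close/<id>:
targets = json.load(urllib.request.urlopen('http://127.0.0.1:9222/json/list', timeout=5))
print([t['id'] for t in targets])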
def kill_listener_processes() -> list[str]:
"""
Kill any daemonized listener processes (consolelog, ssl, responses, etc.).
These hooks write listener.pid files that we need to kill.
Returns list of killed process descriptions.
"""
killed = []
snapshot_dir = Path('.').resolve().parent # Go up from chrome_cleanup dir
# Look for listener.pid files in sibling directories
for extractor_dir in snapshot_dir.iterdir():
if not extractor_dir.is_dir():
continue
pid_file = extractor_dir / 'listener.pid'
if not pid_file.exists():
continue
try:
pid = int(pid_file.read_text().strip())
try:
os.kill(pid, signal.SIGTERM)
# Brief wait for graceful shutdown
for _ in range(5):
try:
os.kill(pid, 0)
time.sleep(0.05)
except OSError:
break
else:
# Force kill if still running
try:
os.kill(pid, signal.SIGKILL)
except OSError:
pass
killed.append(f'{extractor_dir.name} listener (PID {pid})')
except OSError as e:
if e.errno != 3: # Not "No such process"
killed.append(f'{extractor_dir.name} listener (already dead)')
except (ValueError, FileNotFoundError):
pass
return killed
def cleanup_chrome_session() -> tuple[bool, str | None, str]:
"""
Clean up Chrome session started by chrome_session extractor.
For shared sessions (crawl-level Chrome), closes only this snapshot's tab.
For standalone sessions, kills the Chrome process.
Returns: (success, output_info, error_message)
"""
# First, kill any daemonized listener processes
killed = kill_listener_processes()
if killed:
print(f'Killed listener processes: {", ".join(killed)}')
session_dir = Path(CHROME_SESSION_DIR)
if not session_dir.exists():
return True, 'No chrome_session directory found', ''
# Check if this is a shared session
shared_file = session_dir / 'shared_session.txt'
is_shared = False
if shared_file.exists():
is_shared = shared_file.read_text().strip().lower() == 'true'
pid_file = session_dir / 'pid.txt'
cdp_file = session_dir / 'cdp_url.txt'
page_id_file = session_dir / 'page_id.txt'
if is_shared:
# Shared session - only close this snapshot's tab
if cdp_file.exists() and page_id_file.exists():
try:
cdp_url = cdp_file.read_text().strip()
page_id = page_id_file.read_text().strip()
if close_tab_via_cdp(cdp_url, page_id):
return True, f'Closed tab {page_id[:8]}... (shared Chrome session)', ''
else:
return True, f'Tab may already be closed (shared Chrome session)', ''
except Exception as e:
return True, f'Tab cleanup attempted: {e}', ''
return True, 'Shared session - Chrome stays running', ''
# Standalone session - kill the Chrome process
killed = False
if pid_file.exists():

View File

@@ -2,38 +2,27 @@
/**
* Navigate the Chrome browser to the target URL.
*
* This extractor runs AFTER pre-load extractors (21-29) have registered their
* CDP listeners. It connects to the existing Chrome session, navigates to the URL,
* waits for page load, and captures response headers.
* This is a simple hook that ONLY navigates - nothing else.
* Pre-load hooks (21-29) should set up their own CDP listeners.
* Post-load hooks (31+) can then read from the loaded page.
*
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
* Output: Writes to chrome_session/:
* - response_headers.json: HTTP response headers from main document
* - final_url.txt: Final URL after any redirects
* - page_loaded.txt: Marker file indicating navigation is complete
* Output: Writes page_loaded.txt marker when navigation completes
*
* Environment variables:
* CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60)
* CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
* - domcontentloaded: DOM is ready, resources may still load
* - load: Page fully loaded including resources
* - networkidle0: No network activity for 500ms (strictest)
* - networkidle2: At most 2 network connections for 500ms
*
* # Fallbacks
* TIMEOUT: Fallback timeout
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '../chrome_session';
const OUTPUT_DIR = '.';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
@@ -45,18 +34,10 @@ function parseArgs() {
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -67,159 +48,79 @@ function getEnvFloat(name, defaultValue = 0) {
return isNaN(val) ? defaultValue : val;
}
// Read CDP URL from chrome_session
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
return null;
}
if (!fs.existsSync(cdpFile)) return null;
return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Read URL from chrome_session (set by chrome_session extractor)
function getTargetUrl() {
const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt');
if (!fs.existsSync(urlFile)) {
return null;
}
return fs.readFileSync(urlFile, 'utf8').trim();
function getPageId() {
const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
if (!fs.existsSync(pageIdFile)) return null;
return fs.readFileSync(pageIdFile, 'utf8').trim();
}
// Validate wait condition
function getWaitCondition() {
const waitFor = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
const validConditions = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
if (validConditions.includes(waitFor)) {
return waitFor;
}
console.error(`Warning: Invalid CHROME_WAIT_FOR="${waitFor}", using networkidle2`);
return 'networkidle2';
const valid = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
return valid.includes(waitFor) ? waitFor : 'networkidle2';
}
// Sleep helper
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async function navigateToUrl(url, cdpUrl) {
async function navigate(url, cdpUrl) {
const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
const waitUntil = getWaitCondition();
const pageId = getPageId();
let browser = null;
let responseHeaders = {};
let redirectChain = [];
let finalUrl = url;
try {
// Connect to existing browser
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
});
browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
// Get all pages and find our target page
const pages = await browser.pages();
if (pages.length === 0) {
return { success: false, error: 'No pages found in browser' };
}
// Use the last created page (most likely the one chrome_session created)
const page = pages[pages.length - 1];
// Set up response interception to capture headers and redirects
page.on('response', async (response) => {
const request = response.request();
// Track redirects
if (response.status() >= 300 && response.status() < 400) {
redirectChain.push({
url: response.url(),
status: response.status(),
location: response.headers()['location'] || null,
});
}
// Capture headers from the main document request
if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
try {
responseHeaders = {
url: response.url(),
status: response.status(),
statusText: response.statusText(),
headers: response.headers(),
};
finalUrl = response.url();
} catch (e) {
// Ignore errors capturing headers
}
}
});
// Navigate to URL and wait for load
console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
const response = await page.goto(url, {
waitUntil,
timeout,
});
// Capture final response if not already captured
if (response && Object.keys(responseHeaders).length === 0) {
responseHeaders = {
url: response.url(),
status: response.status(),
statusText: response.statusText(),
headers: response.headers(),
};
finalUrl = response.url();
// Find page by target ID if available
let page = null;
if (pageId) {
page = pages.find(p => {
const target = p.target();
return target && target._targetId === pageId;
});
}
if (!page) {
page = pages[pages.length - 1];
}
// Apply optional delay after load
// Navigate
console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
const response = await page.goto(url, { waitUntil, timeout });
// Optional delay
if (delayAfterLoad > 0) {
console.log(`Waiting ${delayAfterLoad}ms after load...`);
await sleep(delayAfterLoad);
}
// Write response headers
if (Object.keys(responseHeaders).length > 0) {
// Add redirect chain to headers
responseHeaders.redirect_chain = redirectChain;
const finalUrl = page.url();
const status = response ? response.status() : null;
fs.writeFileSync(
path.join(CHROME_SESSION_DIR, 'response_headers.json'),
JSON.stringify(responseHeaders, null, 2)
);
}
// Write marker file
fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString());
fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl);
// Write final URL (after redirects)
fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl);
// Write marker file indicating page is loaded
fs.writeFileSync(
path.join(CHROME_SESSION_DIR, 'page_loaded.txt'),
new Date().toISOString()
);
// Disconnect but leave browser running for post-load extractors
browser.disconnect();
return {
success: true,
output: CHROME_SESSION_DIR,
finalUrl,
status: responseHeaders.status,
redirectCount: redirectChain.length,
};
return { success: true, finalUrl, status };
} catch (e) {
// Don't close browser on error - let cleanup handle it
if (browser) {
try {
browser.disconnect();
} catch (disconnectErr) {
// Ignore
}
}
if (browser) browser.disconnect();
return { success: false, error: `${e.name}: ${e.message}` };
}
}
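// Illustrative sketch (not part of this change): a later hook could pick up the results
// written by navigate() above from the shared chrome_session directory. The helper name
// and the relative path are assumptions; the file names match the ones written here.
function readNavigationResultExample(sessionDir = '../chrome_session') {
  const finalUrlFile = path.join(sessionDir, 'final_url.txt');
  const headersFile = path.join(sessionDir, 'response_headers.json');
  return {
    finalUrl: fs.existsSync(finalUrlFile) ? fs.readFileSync(finalUrlFile, 'utf8').trim() : null,
    headers: fs.existsSync(headersFile) ? JSON.parse(fs.readFileSync(headersFile, 'utf8')) : null,
  };
}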
@@ -239,55 +140,33 @@ async function main() {
let output = null;
let error = '';
try {
// Check for chrome_session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
console.error('ERROR: chrome_session not found (cdp_url.txt missing)');
console.error('chrome_navigate requires chrome_session to run first');
process.exit(1);
}
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
console.error('ERROR: chrome_session not found');
process.exit(1);
}
// Get URL from chrome_session or use provided URL
const targetUrl = getTargetUrl() || url;
const result = await navigate(url, cdpUrl);
const result = await navigateToUrl(targetUrl, cdpUrl);
if (result.success) {
status = 'succeeded';
output = result.output;
console.log(`Page loaded: ${result.finalUrl}`);
console.log(`HTTP status: ${result.status}`);
if (result.redirectCount > 0) {
console.log(`Redirects: ${result.redirectCount}`);
}
} else {
status = 'failed';
error = result.error;
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
if (result.success) {
status = 'succeeded';
output = OUTPUT_DIR;
console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status})`);
} else {
error = result.error;
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) {
console.log(`OUTPUT=${output}`);
}
if (output) console.log(`OUTPUT=${output}`);
console.log(`STATUS=${status}`);
if (error) console.error(`ERROR=${error}`);
if (error) {
console.error(`ERROR=${error}`);
}
// Print JSON result
const resultJson = {
console.log(`RESULT_JSON=${JSON.stringify({
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
@@ -297,8 +176,7 @@ async function main() {
duration: Math.round(duration * 100) / 100,
output,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
})}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
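// Illustrative sketch (not part of this change): the KEY=value lines printed above are
// the output contract every hook follows. A caller that captures this script's stdout
// could recover the structured result like so (this parsing helper is an assumption,
// not the orchestrator's actual code):
function parseHookOutputExample(stdout) {
  const lines = stdout.split('\n');
  const statusLine = lines.find(l => l.startsWith('STATUS='));
  const jsonLine = lines.find(l => l.startsWith('RESULT_JSON='));
  return {
    status: statusLine ? statusLine.slice('STATUS='.length) : null,
    result: jsonLine ? JSON.parse(jsonLine.slice('RESULT_JSON='.length)) : null,
  };
}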

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Clean up Chrome browser session at the end of a crawl.
This runs after all snapshots in a crawl have been processed to terminate
the shared Chrome session that was started by on_Crawl__10_chrome_session.js.
Usage: on_Crawl__99_chrome_cleanup.py --crawl-id=<uuid>
Output: Terminates the crawl's Chrome process
"""
import json
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def cleanup_crawl_chrome() -> tuple[bool, str | None, str]:
"""
Clean up Chrome session for the crawl.
Returns: (success, output_info, error_message)
"""
session_dir = Path(CHROME_SESSION_DIR)
if not session_dir.exists():
return True, 'No chrome_session directory found', ''
pid_file = session_dir / 'pid.txt'
killed = False
if pid_file.exists():
try:
pid = int(pid_file.read_text().strip())
# Try graceful termination first
try:
os.kill(pid, signal.SIGTERM)
killed = True
print(f'[*] Sent SIGTERM to Chrome PID {pid}')
# Wait briefly for graceful shutdown
for _ in range(20):
try:
os.kill(pid, 0) # Check if still running
time.sleep(0.1)
except OSError:
print(f'[+] Chrome process {pid} terminated')
break # Process is gone
else:
# Force kill if still running
print('[!] Chrome still running, sending SIGKILL')
try:
os.kill(pid, signal.SIGKILL)
except OSError:
pass
except OSError as e:
# Process might already be dead, that's fine
if e.errno == 3: # No such process
print(f'[*] Chrome process {pid} already terminated')
else:
return False, None, f'Failed to kill Chrome PID {pid}: {e}'
except ValueError:
return False, None, f'Invalid PID in {pid_file}'
except Exception as e:
return False, None, f'{type(e).__name__}: {e}'
result_info = f'Crawl Chrome cleanup: PID {"killed" if killed else "not found or already terminated"}'
return True, result_info, ''
@click.command()
@click.option('--crawl-id', required=True, help='Crawl UUID')
@click.option('--source-url', default='', help='Source URL (unused)')
def main(crawl_id: str, source_url: str):
"""Clean up shared Chrome browser session for crawl."""
start_ts = datetime.now(timezone.utc)
output = None
status = 'failed'
error = ''
try:
success, output, error = cleanup_crawl_chrome()
status = 'succeeded' if success else 'failed'
if success:
print(f'Crawl Chrome cleanup completed: {output}')
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
# Print results
end_ts = datetime.now(timezone.utc)
duration = (end_ts - start_ts).total_seconds()
print(f'START_TS={start_ts.isoformat()}')
print(f'END_TS={end_ts.isoformat()}')
print(f'DURATION={duration:.2f}')
if output:
print(f'OUTPUT={output}')
print(f'STATUS={status}')
if error:
print(f'ERROR={error}', file=sys.stderr)
# Print JSON result
result_json = {
'extractor': EXTRACTOR_NAME,
'crawl_id': crawl_id,
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'output': output,
'error': error or None,
}
print(f'RESULT_JSON={json.dumps(result_json)}')
sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,343 @@
#!/usr/bin/env node
/**
* Launch a shared Chrome browser session for the entire crawl.
*
* This runs once per crawl and keeps Chrome alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__20_chrome_session.js.
*
* Usage: on_Crawl__10_chrome_session.js --crawl-id=<uuid> --source-url=<url>
* Output: Creates chrome_session/ with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_HEADLESS: Run in headless mode (default: true)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
*/
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = 'chrome_session';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Find Chrome binary
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
return chromeBinary;
}
const candidates = [
// Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
for (const candidate of candidates) {
if (fs.existsSync(candidate)) {
return candidate;
}
}
return null;
}
// Parse resolution string
function parseResolution(resolution) {
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
return { width: width || 1440, height: height || 2000 };
}
// Find a free port
function findFreePort() {
return new Promise((resolve, reject) => {
const server = require('net').createServer();
server.unref();
server.on('error', reject);
server.listen(0, () => {
const port = server.address().port;
server.close(() => resolve(port));
});
});
}
// Wait for Chrome's DevTools port to be ready
function waitForDebugPort(port, timeout = 30000) {
const startTime = Date.now();
return new Promise((resolve, reject) => {
const tryConnect = () => {
if (Date.now() - startTime > timeout) {
reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
return;
}
const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
let data = '';
res.on('data', chunk => data += chunk);
res.on('end', () => {
try {
const info = JSON.parse(data);
resolve(info);
} catch (e) {
setTimeout(tryConnect, 100);
}
});
});
req.on('error', () => {
setTimeout(tryConnect, 100);
});
req.setTimeout(1000, () => {
req.destroy();
setTimeout(tryConnect, 100);
});
};
tryConnect();
});
}
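// For reference, the /json/version response polled above looks roughly like the example
// below; only the Browser and webSocketDebuggerUrl fields are used here (values are
// made-up):
//   {
//     "Browser": "Chrome/131.0.0.0",
//     "webSocketDebuggerUrl": "ws://127.0.0.1:<port>/devtools/browser/<uuid>"
//   }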
async function launchChrome(binary) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
const headless = getEnvBool('CHROME_HEADLESS', true);
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Find a free port for Chrome DevTools
const debugPort = await findFreePort();
console.log(`[*] Using debug port: ${debugPort}`);
// Load any installed extensions
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
const installedExtensions = [];
if (fs.existsSync(extensionsDir)) {
const files = fs.readdirSync(extensionsDir);
for (const file of files) {
if (file.endsWith('.extension.json')) {
try {
const extPath = path.join(extensionsDir, file);
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
installedExtensions.push(extData);
console.log(`[*] Loading extension: ${extData.name || file}`);
}
} catch (e) {
// Skip invalid cache files
console.warn(`[!] Skipping invalid extension cache: ${file}`);
}
}
}
}
// Get extension launch arguments
const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions);
if (extensionArgs.length > 0) {
console.log(`[+] Loaded ${installedExtensions.length} extension(s)`);
// Write extensions metadata for config hooks to use
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
// Build Chrome arguments
const chromeArgs = [
`--remote-debugging-port=${debugPort}`,
'--remote-debugging-address=127.0.0.1',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-sync',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-breakpad',
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
'--password-store=basic',
'--use-mock-keychain',
'--font-render-hinting=none',
'--force-color-profile=srgb',
`--window-size=${width},${height}`,
...extensionArgs, // Load extensions
...(headless ? ['--headless=new'] : []),
...(checkSsl ? [] : ['--ignore-certificate-errors']),
'about:blank', // Start with blank page
];
// Launch Chrome as a child process (NOT detached - it lives and dies with the crawl process)
// stdio is set to 'ignore' so we never block on Chrome's output while it remains our child
const chromeProcess = spawn(binary, chromeArgs, {
stdio: ['ignore', 'ignore', 'ignore'],
});
const chromePid = chromeProcess.pid;
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
try {
// Wait for Chrome to be ready
const versionInfo = await waitForDebugPort(debugPort, 30000);
console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
// Build WebSocket URL
const wsUrl = versionInfo.webSocketDebuggerUrl;
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort };
} catch (e) {
// Kill Chrome if setup failed
try {
process.kill(chromePid, 'SIGTERM');
} catch (killErr) {
// Ignore
}
return { success: false, error: `${e.name}: ${e.message}` };
}
}
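// For context, each *.extension.json cache file consumed in launchChrome() is expected to
// look roughly like the example below (name, id, and path are made-up placeholders; only
// the name and unpacked_path fields are read here, and unpacked_path must exist on disk):
//   {
//     "name": "Example Cookie Dismisser",
//     "webstore_id": "<32-char webstore id>",
//     "unpacked_path": "personas/Default/chrome_extensions/<webstore id>"
//   }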
async function main() {
const args = parseArgs();
const crawlId = args.crawl_id;
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
let version = '';
try {
const binary = findChrome();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chrome');
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
process.exit(1);
}
// Get Chrome version
try {
const { execSync } = require('child_process');
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
} catch (e) {
version = '';
}
const result = await launchChrome(binary);
if (result.success) {
status = 'succeeded';
output = OUTPUT_DIR;
console.log(`[+] Chrome session started for crawl ${crawlId}`);
console.log(`[+] CDP URL: ${result.cdpUrl}`);
console.log(`[+] PID: ${result.pid}`);
} else {
status = 'failed';
error = result.error;
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (version) {
console.log(`VERSION=${version}`);
}
if (output) {
console.log(`OUTPUT=${output}`);
}
console.log(`STATUS=${status}`);
if (error) {
console.error(`ERROR=${error}`);
}
// Print JSON result
const resultJson = {
extractor: EXTRACTOR_NAME,
crawl_id: crawlId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
cmd_version: version,
output,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
// Exit with success - Chrome stays running as our child process
// It will be cleaned up when the crawl process terminates
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});

View File

@@ -1,20 +1,21 @@
#!/usr/bin/env node
/**
* Start a Chrome browser session for use by other extractors.
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
* Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
* The chrome_navigate extractor (30) performs the actual page load.
* If a crawl-level Chrome session exists (from on_Crawl__10_chrome_session.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Output: Creates chrome_session/ with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
* - page_id.txt: Target ID of the page for other extractors to use
* - url.txt: The URL to be navigated to (for chrome_navigate)
* - cdp_url.txt: WebSocket URL for CDP connection (copied or new)
* - pid.txt: Chrome process ID (from crawl or new)
* - page_id.txt: Target ID of this snapshot's tab
* - url.txt: The URL to be navigated to
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* DATA_DIR: Data directory (to find crawl's Chrome session)
* CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
@@ -23,18 +24,13 @@
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const http = require('http');
const puppeteer = require('puppeteer-core');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = 'chrome_session';
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
const OUTPUT_DIR = '.'; // Hook already runs in the output directory
// Parse command line arguments
function parseArgs() {
@@ -60,13 +56,7 @@ function getEnvBool(name, defaultValue = false) {
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
}
// Find Chrome binary
// Find Chrome binary (for fallback)
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
@@ -74,12 +64,10 @@ function findChrome() {
}
const candidates = [
// Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
@@ -99,40 +87,132 @@ function parseResolution(resolution) {
return { width: width || 1440, height: height || 2000 };
}
// Load installed extensions from cache files
function loadInstalledExtensions() {
const extensions = [];
// Find a free port
function findFreePort() {
return new Promise((resolve, reject) => {
const server = require('net').createServer();
server.unref();
server.on('error', reject);
server.listen(0, () => {
const port = server.address().port;
server.close(() => resolve(port));
});
});
}
if (!fs.existsSync(EXTENSIONS_DIR)) {
return extensions;
}
// Wait for Chrome's DevTools port to be ready
function waitForDebugPort(port, timeout = 30000) {
const startTime = Date.now();
// Look for *.extension.json cache files created by extension plugins
const files = fs.readdirSync(EXTENSIONS_DIR);
const extensionFiles = files.filter(f => f.endsWith('.extension.json'));
return new Promise((resolve, reject) => {
const tryConnect = () => {
if (Date.now() - startTime > timeout) {
reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
return;
}
for (const file of extensionFiles) {
const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
let data = '';
res.on('data', chunk => data += chunk);
res.on('end', () => {
try {
const info = JSON.parse(data);
resolve(info);
} catch (e) {
setTimeout(tryConnect, 100);
}
});
});
req.on('error', () => {
setTimeout(tryConnect, 100);
});
req.setTimeout(1000, () => {
req.destroy();
setTimeout(tryConnect, 100);
});
};
tryConnect();
});
}
// Try to find the crawl's Chrome session
function findCrawlChromeSession(crawlId) {
if (!crawlId) return null;
const dataDir = getEnv('DATA_DIR', '.');
const crawlChromeDir = path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session');
const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
const pidFile = path.join(crawlChromeDir, 'pid.txt');
if (fs.existsSync(cdpFile) && fs.existsSync(pidFile)) {
try {
const filePath = path.join(EXTENSIONS_DIR, file);
const data = fs.readFileSync(filePath, 'utf-8');
const extension = JSON.parse(data);
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
// Verify extension is actually installed
const manifestPath = path.join(extension.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
extensions.push(extension);
console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
// Verify the process is still running
try {
process.kill(pid, 0); // Signal 0 = check if process exists
return { cdpUrl, pid };
} catch (e) {
// Process not running
return null;
}
} catch (e) {
console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`);
return null;
}
}
return extensions;
return null;
}
// Create a new tab in an existing Chrome session
async function createTabInExistingChrome(cdpUrl, url, pid) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const { width, height } = parseResolution(resolution);
async function startChromeSession(url, binary) {
console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
// Connect Puppeteer to the running Chrome
const browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
defaultViewport: { width, height },
});
// Create a new tab for this snapshot
const page = await browser.newPage();
// Set viewport
await page.setViewport({ width, height });
// Set user agent if specified
if (userAgent) {
await page.setUserAgent(userAgent);
}
// Get the page target ID
const target = page.target();
const targetId = target._targetId;
// Write session info
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(pid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'true');
// Disconnect Puppeteer (Chrome and tab stay alive)
browser.disconnect();
return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid, shared: true };
}
// Fallback: Launch a new Chrome instance for this snapshot
async function launchNewChrome(url, binary) {
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
@@ -140,115 +220,98 @@ async function startChromeSession(url, binary) {
const { width, height } = parseResolution(resolution);
// Load installed extensions
const extensions = loadInstalledExtensions();
const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);
// Find a free port for Chrome DevTools
const debugPort = await findFreePort();
console.log(`[*] Launching new Chrome on port: ${debugPort}`);
if (extensions.length > 0) {
console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
}
// Build Chrome arguments
const chromeArgs = [
`--remote-debugging-port=${debugPort}`,
'--remote-debugging-address=127.0.0.1',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-sync',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-breakpad',
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
'--password-store=basic',
'--use-mock-keychain',
'--font-render-hinting=none',
'--force-color-profile=srgb',
`--window-size=${width},${height}`,
...(headless ? ['--headless=new'] : []),
...(checkSsl ? [] : ['--ignore-certificate-errors']),
'about:blank',
];
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Launch Chrome as a detached process (since no crawl-level Chrome exists)
const chromeProcess = spawn(binary, chromeArgs, {
detached: true,
stdio: ['ignore', 'ignore', 'ignore'],
});
chromeProcess.unref();
let browser = null;
const chromePid = chromeProcess.pid;
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
try {
// Launch browser with Puppeteer
browser = await puppeteer.launch({
executablePath: binary,
headless: headless ? 'new' : false,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-sync',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-breakpad',
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
'--password-store=basic',
'--use-mock-keychain',
'--font-render-hinting=none',
'--force-color-profile=srgb',
`--window-size=${width},${height}`,
...(checkSsl ? [] : ['--ignore-certificate-errors']),
...extensionArgs,
],
// Wait for Chrome to be ready
const versionInfo = await waitForDebugPort(debugPort, 30000);
console.log(`[+] Chrome ready: ${versionInfo.Browser}`);
const wsUrl = versionInfo.webSocketDebuggerUrl;
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
// Connect Puppeteer to get page info
const browser = await puppeteer.connect({
browserWSEndpoint: wsUrl,
defaultViewport: { width, height },
});
// Get the WebSocket endpoint URL
const cdpUrl = browser.wsEndpoint();
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
let pages = await browser.pages();
let page = pages[0];
// Write PID for cleanup
const browserProcess = browser.process();
if (browserProcess) {
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
if (!page) {
page = await browser.newPage();
}
// Create a new page (but DON'T navigate yet)
const page = await browser.newPage();
await page.setViewport({ width, height });
// Set user agent if specified
if (userAgent) {
await page.setUserAgent(userAgent);
}
// Write the page target ID so other extractors can find this specific page
const target = page.target();
const targetId = target._targetId;
fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
// Write the URL for chrome_navigate to use
fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
fs.writeFileSync(path.join(OUTPUT_DIR, 'shared_session.txt'), 'false');
// Connect to loaded extensions at runtime (only if not already done)
const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
console.log('[*] Connecting to loaded extensions (first time setup)...');
try {
const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);
// Write loaded extensions metadata for other extractors to use
fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));
console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
} catch (e) {
console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
}
} else if (extensions.length > 0) {
console.log('[*] Extensions already loaded from previous snapshot');
}
// Don't close browser - leave it running for other extractors
// Detach puppeteer from browser so it stays running
browser.disconnect();
return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };
return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid, shared: false };
} catch (e) {
// Kill browser if startup failed
if (browser) {
try {
await browser.close();
} catch (closeErr) {
// Ignore
}
try {
process.kill(chromePid, 'SIGTERM');
} catch (killErr) {
// Ignore
}
return { success: false, error: `${e.name}: ${e.message}` };
}
@@ -258,9 +321,10 @@ async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
const crawlId = args.crawl_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
process.exit(1);
}
@@ -271,9 +335,6 @@ async function main() {
let version = '';
try {
// chrome_session launches Chrome and creates a blank page
// Pre-load extractors (21-29) register CDP listeners
// chrome_navigate (30) performs actual navigation
const binary = findChrome();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
@@ -291,13 +352,24 @@ async function main() {
version = '';
}
const result = await startChromeSession(url, binary);
// Try to use existing crawl Chrome session
const crawlSession = findCrawlChromeSession(crawlId);
let result;
if (crawlSession) {
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
} else {
console.log(`[*] No crawl Chrome session found, launching new Chrome`);
result = await launchNewChrome(url, binary);
}
if (result.success) {
status = 'succeeded';
output = result.output;
console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
console.log(`Page target ID: ${result.targetId}`);
console.log(`[+] Chrome session ready (shared: ${result.shared})`);
console.log(`[+] CDP URL: ${result.cdpUrl}`);
console.log(`[+] Page target ID: ${result.targetId}`);
} else {
status = 'failed';
error = result.error;
@@ -331,6 +403,7 @@ async function main() {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
crawl_id: crawlId || null,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),

View File

@@ -1,31 +1,24 @@
#!/usr/bin/env node
/**
* Capture console output from a page.
* Capture console output from a page (DAEMON MODE).
*
* Captures all console messages during page load:
* - log, warn, error, info, debug
* - Includes stack traces for errors
* - Timestamps for each message
* This hook daemonizes and stays alive to capture console logs throughout
* the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
* Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>
* Output: Writes consolelog/console.jsonl (one message per line)
*
* Environment variables:
* SAVE_CONSOLELOG: Enable console log capture (default: true)
* CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5)
* Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>
* Output: Writes console.jsonl + listener.pid
*/
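// Illustrative sketch (assumption about the cleanup side, which lives in a separate hook):
// the PID written to listener.pid below can be used later to stop this daemonized listener, e.g.
//   const pid = parseInt(fs.readFileSync('consolelog/listener.pid', 'utf8'), 10);
//   try { process.kill(pid, 'SIGTERM'); } catch (e) { /* listener already exited */ }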
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
@@ -37,7 +30,6 @@ function parseArgs() {
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
@@ -49,12 +41,6 @@ function getEnvBool(name, defaultValue = false) {
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
}
// Get CDP URL from chrome_session
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
@@ -63,7 +49,14 @@ function getCdpUrl() {
return null;
}
// Serialize console message arguments
function getPageId() {
const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
if (fs.existsSync(pageIdFile)) {
return fs.readFileSync(pageIdFile, 'utf8').trim();
}
return null;
}
async function serializeArgs(args) {
const serialized = [];
for (const arg of args) {
@@ -71,7 +64,6 @@ async function serializeArgs(args) {
const json = await arg.jsonValue();
serialized.push(json);
} catch (e) {
// If jsonValue() fails, try to get text representation
try {
serialized.push(String(arg));
} catch (e2) {
@@ -82,128 +74,84 @@ async function serializeArgs(args) {
return serialized;
}
// Capture console logs
async function captureConsoleLogs(url) {
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
// Output directory is current directory (hook already runs in output dir)
async function setupListeners() {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
fs.writeFileSync(outputPath, ''); // Clear existing
// Clear existing file
fs.writeFileSync(outputPath, '');
let browser = null;
const consoleLogs = [];
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
});
// Get the page
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
return { success: false, error: 'No page found in Chrome session' };
}
// Listen for console messages
page.on('console', async (msg) => {
try {
const type = msg.type();
const text = msg.text();
const location = msg.location();
const args = await serializeArgs(msg.args());
const logEntry = {
timestamp: new Date().toISOString(),
type,
text,
args,
location: {
url: location.url || '',
lineNumber: location.lineNumber,
columnNumber: location.columnNumber,
},
};
// Write immediately to file
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
consoleLogs.push(logEntry);
} catch (e) {
// Error processing console message, skip it
console.error(`Error processing console message: ${e.message}`);
}
});
// Listen for page errors
page.on('pageerror', (error) => {
try {
const logEntry = {
timestamp: new Date().toISOString(),
type: 'error',
text: error.message,
stack: error.stack || '',
location: {},
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
consoleLogs.push(logEntry);
} catch (e) {
console.error(`Error processing page error: ${e.message}`);
}
});
// Listen for request failures
page.on('requestfailed', (request) => {
try {
const failure = request.failure();
const logEntry = {
timestamp: new Date().toISOString(),
type: 'request_failed',
text: `Request failed: ${request.url()}`,
error: failure ? failure.errorText : 'Unknown error',
url: request.url(),
location: {},
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
consoleLogs.push(logEntry);
} catch (e) {
console.error(`Error processing request failure: ${e.message}`);
}
});
// Wait to capture logs
await new Promise(resolve => setTimeout(resolve, captureTimeout));
// Group logs by type
const logStats = consoleLogs.reduce((acc, log) => {
acc[log.type] = (acc[log.type] || 0) + 1;
return acc;
}, {});
return {
success: true,
output: outputPath,
logCount: consoleLogs.length,
logStats,
};
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
if (browser) {
browser.disconnect();
}
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
throw new Error('No Chrome session found');
}
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
// Find our page
const pages = await browser.pages();
const pageId = getPageId();
let page = null;
if (pageId) {
page = pages.find(p => {
const target = p.target();
return target && target._targetId === pageId;
});
}
if (!page) {
page = pages[pages.length - 1];
}
if (!page) {
throw new Error('No page found');
}
// Set up listeners that write directly to file
page.on('console', async (msg) => {
try {
const logEntry = {
timestamp: new Date().toISOString(),
type: msg.type(),
text: msg.text(),
args: await serializeArgs(msg.args()),
location: msg.location(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
} catch (e) {
// Ignore errors
}
});
page.on('pageerror', (error) => {
try {
const logEntry = {
timestamp: new Date().toISOString(),
type: 'error',
text: error.message,
stack: error.stack || '',
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
} catch (e) {
// Ignore
}
});
page.on('requestfailed', (request) => {
try {
const failure = request.failure();
const logEntry = {
timestamp: new Date().toISOString(),
type: 'request_failed',
text: `Request failed: ${request.url()}`,
error: failure ? failure.errorText : 'Unknown error',
url: request.url(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
} catch (e) {
// Ignore
}
});
// Don't disconnect - keep browser connection alive
return { browser, page };
}
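// For reference, each line appended to console.jsonl by the listeners above is a
// standalone JSON object, e.g. (values are made-up):
//   {"timestamp":"2025-01-01T00:00:00.000Z","type":"warn","text":"deprecated API",
//    "args":["deprecated API"],"location":{"url":"https://example.com/app.js","lineNumber":12,"columnNumber":4}}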
async function main() {
@@ -212,80 +160,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
if (!getEnvBool('SAVE_CONSOLELOG', true)) {
console.log('Skipping (SAVE_CONSOLELOG=False)');
const result = {
extractor: EXTRACTOR_NAME,
status: 'skipped',
url,
snapshot_id: snapshotId,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(0);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
let logCount = 0;
try {
// Check if enabled
if (!getEnvBool('SAVE_CONSOLELOG', true)) {
console.log('Skipping console log (SAVE_CONSOLELOG=False)');
status = 'skipped';
const endTs = new Date();
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`STATUS=${status}`);
console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
process.exit(0);
}
// Set up listeners
await setupListeners();
const result = await captureConsoleLogs(url);
// Write PID file so chrome_cleanup can kill us
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
if (result.success) {
status = 'succeeded';
output = result.output;
logCount = result.logCount || 0;
const statsStr = Object.entries(result.logStats || {})
.map(([type, count]) => `${count} ${type}`)
.join(', ');
console.log(`Captured ${logCount} console messages: ${statsStr}`);
} else {
status = 'failed';
error = result.error;
// Report success immediately (we're staying alive in background)
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
console.log(`OUTPUT=${OUTPUT_FILE}`);
console.log(`STATUS=succeeded`);
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'succeeded',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output: OUTPUT_FILE,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
// Daemonize: detach from parent and keep running
// This process will be killed by chrome_cleanup
if (process.stdin.isTTY) {
process.stdin.pause();
}
process.stdin.unref();
process.stdout.end();
process.stderr.end();
// Keep the process alive indefinitely
// Will be killed by chrome_cleanup via the PID file
setInterval(() => {}, 1000);
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) {
console.log(`OUTPUT=${output}`);
}
console.log(`STATUS=${status}`);
if (error) {
const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
const endTs = new Date();
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'failed',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
error,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(1);
}
// Print JSON result
const resultJson = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output,
log_count: logCount,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Dismisses cookie consent popups

View File

@@ -1,278 +0,0 @@
#!/usr/bin/env node
/**
* Track complete redirect chains for a URL.
*
* Captures:
* - HTTP redirects (301, 302, 303, 307, 308)
* - Meta refresh redirects
* - JavaScript redirects (basic detection)
* - Full redirect chain with timestamps
*
* Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>
* Output: Writes redirects/redirects.json
*
* Environment variables:
* SAVE_REDIRECTS: Enable redirect tracking (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Get CDP URL from chrome_session
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
return fs.readFileSync(cdpFile, 'utf8').trim();
}
return null;
}
// Track redirect chain
async function trackRedirects(url) {
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;
const redirectChain = [];
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
});
// Get the page
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
return { success: false, error: 'No page found in Chrome session' };
}
// Track all responses to capture redirects
page.on('response', async (response) => {
const status = response.status();
const responseUrl = response.url();
const headers = response.headers();
// Check if it's a redirect
if (status >= 300 && status < 400) {
redirectChain.push({
timestamp: new Date().toISOString(),
url: responseUrl,
status,
statusText: response.statusText(),
location: headers['location'] || headers['Location'] || '',
type: 'http',
});
}
});
// Get the current URL (which is the final destination after redirects)
const finalUrl = page.url();
// Check for meta refresh redirects
const metaRefresh = await page.evaluate(() => {
const meta = document.querySelector('meta[http-equiv="refresh"]');
if (meta) {
const content = meta.getAttribute('content') || '';
const match = content.match(/url=['"]?([^'"]+)['"]?/i);
return {
content,
url: match ? match[1] : null,
};
}
return null;
});
if (metaRefresh && metaRefresh.url) {
redirectChain.push({
timestamp: new Date().toISOString(),
url: finalUrl,
status: null,
statusText: 'Meta Refresh',
location: metaRefresh.url,
type: 'meta_refresh',
content: metaRefresh.content,
});
}
// Check for JavaScript redirects (basic detection)
const jsRedirect = await page.evaluate(() => {
// Check for common JavaScript redirect patterns
const html = document.documentElement.outerHTML;
const patterns = [
/window\.location\s*=\s*['"]([^'"]+)['"]/i,
/window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
/window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
/document\.location\s*=\s*['"]([^'"]+)['"]/i,
];
for (const pattern of patterns) {
const match = html.match(pattern);
if (match) {
return {
pattern: pattern.toString(),
url: match[1],
};
}
}
return null;
});
if (jsRedirect && jsRedirect.url) {
redirectChain.push({
timestamp: new Date().toISOString(),
url: finalUrl,
status: null,
statusText: 'JavaScript Redirect',
location: jsRedirect.url,
type: 'javascript',
pattern: jsRedirect.pattern,
});
}
const redirectData = {
original_url: url,
final_url: finalUrl,
redirect_count: redirectChain.length,
redirects: redirectChain,
is_redirect: redirectChain.length > 0,
};
// Write output
fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2));
return { success: true, output: outputPath, redirectData };
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
if (browser) {
browser.disconnect();
}
}
}
async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if enabled
if (!getEnvBool('SAVE_REDIRECTS', true)) {
console.log('Skipping redirects (SAVE_REDIRECTS=False)');
status = 'skipped';
const endTs = new Date();
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`STATUS=${status}`);
console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
process.exit(0);
}
const result = await trackRedirects(url);
if (result.success) {
status = 'succeeded';
output = result.output;
const redirectCount = result.redirectData.redirect_count;
const finalUrl = result.redirectData.final_url;
if (redirectCount > 0) {
console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`);
} else {
console.log('No redirects detected');
}
} else {
status = 'failed';
error = result.error;
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) {
console.log(`OUTPUT=${output}`);
}
console.log(`STATUS=${status}`);
if (error) {
console.error(`ERROR=${error}`);
}
// Print JSON result
const resultJson = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});

View File

@@ -0,0 +1,248 @@
#!/usr/bin/env node
/**
* Detect redirects by comparing original URL to final URL.
*
* This runs AFTER chrome_navigate and checks:
* - URL changed (HTTP redirect occurred)
* - Meta refresh tags (pending redirects)
* - JavaScript redirects (basic detection)
*
* Usage: on_Snapshot__31_redirects.js --url=<url> --snapshot-id=<uuid>
* Output: Writes redirects.json
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
const CHROME_SESSION_DIR = '../chrome_session';
const CHROME_NAVIGATE_DIR = '../chrome_navigate';
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
function getCdpUrl() {
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (fs.existsSync(cdpFile)) {
return fs.readFileSync(cdpFile, 'utf8').trim();
}
return null;
}
function getPageId() {
const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
if (fs.existsSync(pageIdFile)) {
return fs.readFileSync(pageIdFile, 'utf8').trim();
}
return null;
}
function getFinalUrl() {
// Try chrome_navigate output first
const navFile = path.join(CHROME_NAVIGATE_DIR, 'final_url.txt');
if (fs.existsSync(navFile)) {
return fs.readFileSync(navFile, 'utf8').trim();
}
return null;
}
async function detectRedirects(originalUrl) {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
const redirects = [];
// Get final URL from chrome_navigate
let finalUrl = getFinalUrl() || originalUrl;
// Check if URL changed (indicates redirect)
const urlChanged = originalUrl !== finalUrl;
if (urlChanged) {
redirects.push({
timestamp: new Date().toISOString(),
from_url: originalUrl,
to_url: finalUrl,
type: 'http',
detected_by: 'url_comparison',
});
}
// Connect to Chrome to check for meta refresh and JS redirects
const cdpUrl = getCdpUrl();
if (cdpUrl) {
let browser = null;
try {
browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
const pages = await browser.pages();
const pageId = getPageId();
let page = null;
if (pageId) {
page = pages.find(p => {
const target = p.target();
return target && target._targetId === pageId;
});
}
if (!page) {
page = pages.find(p => p.url().startsWith('http')) || pages[pages.length - 1];
}
if (page) {
// Update finalUrl from actual page
const pageUrl = page.url();
if (pageUrl && pageUrl !== 'about:blank') {
finalUrl = pageUrl;
}
// Check for meta refresh
try {
const metaRefresh = await page.evaluate(() => {
const meta = document.querySelector('meta[http-equiv="refresh"]');
if (meta) {
const content = meta.getAttribute('content') || '';
const match = content.match(/url=['"]?([^'";\s]+)['"]?/i);
return { content, url: match ? match[1] : null };
}
return null;
});
if (metaRefresh && metaRefresh.url) {
redirects.push({
timestamp: new Date().toISOString(),
from_url: finalUrl,
to_url: metaRefresh.url,
type: 'meta_refresh',
content: metaRefresh.content,
});
}
} catch (e) { /* ignore */ }
// Check for JS redirects
try {
const jsRedirect = await page.evaluate(() => {
const html = document.documentElement.outerHTML;
const patterns = [
/window\.location\s*=\s*['"]([^'"]+)['"]/i,
/window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
/window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
];
for (const pattern of patterns) {
const match = html.match(pattern);
if (match) return { url: match[1], pattern: pattern.toString() };
}
return null;
});
if (jsRedirect && jsRedirect.url) {
redirects.push({
timestamp: new Date().toISOString(),
from_url: finalUrl,
to_url: jsRedirect.url,
type: 'javascript',
});
}
} catch (e) { /* ignore */ }
}
browser.disconnect();
} catch (e) {
console.error(`Warning: Could not connect to Chrome: ${e.message}`);
}
}
const result = {
original_url: originalUrl,
final_url: finalUrl,
redirect_count: redirects.length,
redirects,
is_redirect: originalUrl !== finalUrl || redirects.length > 0,
};
fs.writeFileSync(outputPath, JSON.stringify(result, null, 2));
return { success: true, output: outputPath, data: result };
}
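// For reference, the redirects.json written above looks roughly like this when a single
// HTTP redirect is detected (URLs and timestamp are made-up examples):
//   {
//     "original_url": "http://example.com",
//     "final_url": "https://example.com/",
//     "redirect_count": 1,
//     "redirects": [
//       {
//         "timestamp": "2025-01-01T00:00:00.000Z",
//         "from_url": "http://example.com",
//         "to_url": "https://example.com/",
//         "type": "http",
//         "detected_by": "url_comparison"
//       }
//     ],
//     "is_redirect": true
//   }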
async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__31_redirects.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
if (!getEnvBool('SAVE_REDIRECTS', true)) {
console.log('Skipping redirects (SAVE_REDIRECTS=False)');
status = 'skipped';
} else {
try {
const result = await detectRedirects(url);
status = 'succeeded';
output = result.output;
if (result.data.is_redirect) {
console.log(`Redirect detected: ${url} -> ${result.data.final_url}`);
} else {
console.log('No redirects detected');
}
} catch (e) {
error = `${e.name}: ${e.message}`;
}
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) console.log(`OUTPUT=${output}`);
console.log(`STATUS=${status}`);
if (error) console.error(`ERROR=${error}`);
console.log(`RESULT_JSON=${JSON.stringify({
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output,
error: error || null,
})}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});

View File

@@ -1,22 +1,12 @@
#!/usr/bin/env node
/**
* Archive all network responses during page load.
* Archive all network responses during page load (DAEMON MODE).
*
* Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.)
* Saves them in an organized directory structure with both timestamped unique files
* and URL-organized symlinks.
* This hook daemonizes and stays alive to capture network responses throughout
* the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
* Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>
* Output: Creates responses/ directory with:
* - all/<timestamp>__<METHOD>__<URL>.<ext>: Timestamped unique files
* - <type>/<domain>/<path>/: URL-organized symlinks by resource type
* - index.jsonl: Searchable index of all responses
*
* Environment variables:
* SAVE_RESPONSES: Enable response archiving (default: true)
* RESPONSES_TIMEOUT: Timeout in seconds (default: 120)
* RESPONSES_TYPES: Comma-separated resource types to save (default: all)
* Options: script,stylesheet,font,image,media,xhr,websocket,document
* Usage: on_Snapshot__24_responses.js --url=<url> --snapshot-id=<uuid>
* Output: Creates responses/ directory with index.jsonl + listener.pid
*/
const fs = require('fs');
@@ -27,6 +17,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
const OUTPUT_DIR = '.';
const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
@@ -70,6 +61,14 @@ function getCdpUrl() {
return null;
}
function getPageId() {
const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
if (fs.existsSync(pageIdFile)) {
return fs.readFileSync(pageIdFile, 'utf8').trim();
}
return null;
}
// Get file extension from MIME type
function getExtensionFromMimeType(mimeType) {
const mimeMap = {
@@ -139,17 +138,14 @@ async function createSymlink(target, linkPath) {
fs.symlinkSync(relativePath, linkPath);
} catch (e) {
// Ignore symlink errors (file conflicts, permissions, etc.)
console.error(`Failed to create symlink: ${e.message}`);
}
}
// Archive responses by intercepting network traffic
async function archiveResponses(originalUrl) {
const timeout = (getEnvInt('RESPONSES_TIMEOUT') || getEnvInt('TIMEOUT', 120)) * 1000;
// Set up response listener
async function setupListener() {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
// Output directory is current directory (hook already runs in output dir)
// Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
@@ -160,138 +156,119 @@ async function archiveResponses(originalUrl) {
const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
fs.writeFileSync(indexPath, ''); // Clear existing
let browser = null;
let savedCount = 0;
const savedResponses = [];
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
}
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
});
// Get the page
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
return { success: false, error: 'No page found in Chrome session' };
}
// Enable request interception
await page.setRequestInterception(false); // Don't block requests
// Listen for responses
page.on('response', async (response) => {
try {
const request = response.request();
const url = response.url();
const resourceType = request.resourceType().toLowerCase();
const method = request.method();
const status = response.status();
// Skip redirects and errors
if (status >= 300 && status < 400) return;
if (status >= 400 && status < 600) return;
// Check if we should save this resource type
if (typesToSave.length && !typesToSave.includes(resourceType)) {
return;
}
// Get response body
let bodyBuffer = null;
try {
bodyBuffer = await response.buffer();
} catch (e) {
// Some responses can't be captured (already consumed, etc.)
return;
}
if (!bodyBuffer || bodyBuffer.length === 0) {
return;
}
// Determine file extension
const mimeType = response.headers()['content-type'] || '';
let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
// Create timestamp-based unique filename
const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
const uniquePath = path.join(allDir, uniqueFilename);
// Save to unique file
fs.writeFileSync(uniquePath, bodyBuffer);
// Create URL-organized symlink
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname || '/';
const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
const dirPath = path.dirname(pathname);
// Create symlink: responses/<type>/<hostname>/<path>/<filename>
const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
const symlinkPath = path.join(symlinkDir, filename);
await createSymlink(uniquePath, symlinkPath);
} catch (e) {
// URL parsing or symlink creation failed, skip
}
// Calculate SHA256
const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
// Write to index
const indexEntry = {
ts: timestamp,
method,
url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
urlSha256,
status,
resourceType,
mimeType: mimeType.split(';')[0],
responseSha256: sha256,
path: './' + path.relative(OUTPUT_DIR, uniquePath),
extension,
};
fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
savedResponses.push(indexEntry);
savedCount++;
} catch (e) {
// Log but don't fail the whole extraction
console.error(`Error capturing response: ${e.message}`);
}
});
// Wait a bit to ensure we capture responses
// (chrome_session already loaded the page, just capture any remaining traffic)
await new Promise(resolve => setTimeout(resolve, 2000));
return {
success: true,
output: OUTPUT_DIR,
savedCount,
indexPath,
};
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
if (browser) {
browser.disconnect();
}
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
throw new Error('No Chrome session found');
}
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
// Find our page
const pages = await browser.pages();
const pageId = getPageId();
let page = null;
if (pageId) {
page = pages.find(p => {
const target = p.target();
return target && target._targetId === pageId;
});
}
if (!page) {
page = pages[pages.length - 1];
}
if (!page) {
throw new Error('No page found');
}
// Set up response listener to capture network traffic
page.on('response', async (response) => {
try {
const request = response.request();
const url = response.url();
const resourceType = request.resourceType().toLowerCase();
const method = request.method();
const status = response.status();
// Skip redirects and errors
if (status >= 300 && status < 400) return;
if (status >= 400 && status < 600) return;
// Check if we should save this resource type
if (typesToSave.length && !typesToSave.includes(resourceType)) {
return;
}
// Get response body
let bodyBuffer = null;
try {
bodyBuffer = await response.buffer();
} catch (e) {
// Some responses can't be captured (already consumed, etc.)
return;
}
if (!bodyBuffer || bodyBuffer.length === 0) {
return;
}
// Determine file extension
const mimeType = response.headers()['content-type'] || '';
let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
// Create timestamp-based unique filename
const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
const uniquePath = path.join(allDir, uniqueFilename);
// Save to unique file
fs.writeFileSync(uniquePath, bodyBuffer);
// Create URL-organized symlink
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const pathname = urlObj.pathname || '/';
const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
const dirPath = path.dirname(pathname);
// Create symlink: responses/<type>/<hostname>/<path>/<filename>
const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
const symlinkPath = path.join(symlinkDir, filename);
await createSymlink(uniquePath, symlinkPath);
} catch (e) {
// URL parsing or symlink creation failed, skip
}
// Calculate SHA256
const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
// Write to index
const indexEntry = {
ts: timestamp,
method,
url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
urlSha256,
status,
resourceType,
mimeType: mimeType.split(';')[0],
responseSha256: sha256,
path: './' + path.relative(OUTPUT_DIR, uniquePath),
extension,
};
fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
} catch (e) {
// Ignore per-response errors (body unavailable, page navigated away, etc.)
}
});
// Don't disconnect - keep browser connection alive
return { browser, page };
}
async function main() {
@@ -300,77 +277,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__24_responses.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
if (!getEnvBool('SAVE_RESPONSES', true)) {
console.log('Skipping (SAVE_RESPONSES=False)');
const result = {
extractor: EXTRACTOR_NAME,
status: 'skipped',
url,
snapshot_id: snapshotId,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(0);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
let savedCount = 0;
try {
// Check if enabled
if (!getEnvBool('SAVE_RESPONSES', true)) {
console.log('Skipping responses (SAVE_RESPONSES=False)');
status = 'skipped';
const endTs = new Date();
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`STATUS=${status}`);
console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
process.exit(0);
}
// Set up listener
await setupListener();
const result = await archiveResponses(url);
// Write PID file so chrome_cleanup can kill us
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
if (result.success) {
status = 'succeeded';
output = result.output;
savedCount = result.savedCount || 0;
console.log(`Saved ${savedCount} network responses to ${output}/`);
} else {
status = 'failed';
error = result.error;
// Report success immediately (we're staying alive in background)
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
console.log(`OUTPUT=responses/`);
console.log(`STATUS=succeeded`);
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'succeeded',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output: 'responses/',
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
// Daemonize: detach from parent and keep running
// This process will be killed by chrome_cleanup
if (process.stdin.isTTY) {
process.stdin.pause();
}
process.stdin.unref();
process.stdout.end();
process.stderr.end();
// Keep the process alive indefinitely
// Will be killed by chrome_cleanup via the PID file
setInterval(() => {}, 1000);
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) {
console.log(`OUTPUT=${output}`);
}
console.log(`STATUS=${status}`);
if (error) {
const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
const endTs = new Date();
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'failed',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
error,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(1);
}
// Print JSON result
const resultJson = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output,
saved_count: savedCount,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
* Priority: 04 (early) - Must install before the Chrome session starts at the Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files

View File

@@ -1,18 +1,12 @@
#!/usr/bin/env node
/**
* Extract SSL/TLS certificate details from a URL.
* Extract SSL/TLS certificate details from a URL (DAEMON MODE).
*
* Connects to Chrome session and retrieves security details including:
* - Protocol (TLS 1.2, TLS 1.3, etc.)
* - Cipher suite
* - Certificate issuer, validity period
* - Security state
* This hook daemonizes and stays alive to capture SSL details throughout
* the snapshot lifecycle. It's killed by chrome_cleanup at the end.
*
* Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>
* Output: Writes ssl/ssl.json
*
* Environment variables:
* SAVE_SSL: Enable SSL extraction (default: true)
* Usage: on_Snapshot__23_ssl.js --url=<url> --snapshot-id=<uuid>
* Output: Writes ssl.json + listener.pid
*/
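The fields written to ssl.json (protocol, issuer, validFrom/validTo, etc.) are plain JSON; a small sketch of a downstream consumer (hypothetical, assuming this hook's output dir is ssl/ and that validTo is the unix timestamp in seconds that puppeteer's securityDetails reports):

const fs = require('fs');

// Hypothetical consumer: report how many days remain before the captured cert expires.
function daysUntilExpiry(sslJsonPath) {
  const info = JSON.parse(fs.readFileSync(sslJsonPath, 'utf8'));
  if (!info.validTo) return null; // no securityDetails were captured for this snapshot
  return Math.floor((info.validTo * 1000 - Date.now()) / (24 * 60 * 60 * 1000));
}
// e.g. daysUntilExpiry('ssl/ssl.json')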
const fs = require('fs');
@@ -23,6 +17,7 @@ const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
const PID_FILE = 'listener.pid';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
@@ -58,103 +53,103 @@ function getCdpUrl() {
return null;
}
// Extract SSL details
async function extractSsl(url) {
// Output directory is current directory (hook already runs in output dir)
function getPageId() {
const pageIdFile = path.join(CHROME_SESSION_DIR, 'page_id.txt');
if (fs.existsSync(pageIdFile)) {
return fs.readFileSync(pageIdFile, 'utf8').trim();
}
return null;
}
// Set up SSL listener
async function setupListener(url) {
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs
if (!url.startsWith('https://')) {
return { success: false, error: 'URL is not HTTPS' };
throw new Error('URL is not HTTPS');
}
let browser = null;
let sslInfo = {};
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
throw new Error('No Chrome session found');
}
try {
// Connect to existing Chrome session
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
}
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
browser = await puppeteer.connect({
browserWSEndpoint: cdpUrl,
// Find our page
const pages = await browser.pages();
const pageId = getPageId();
let page = null;
if (pageId) {
page = pages.find(p => {
const target = p.target();
return target && target._targetId === pageId;
});
}
if (!page) {
page = pages[pages.length - 1];
}
// Get the page
const pages = await browser.pages();
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
if (!page) {
throw new Error('No page found');
}
if (!page) {
return { success: false, error: 'No page found in Chrome session' };
}
// Set up listener to capture SSL details when chrome_navigate loads the page
page.on('response', async (response) => {
try {
const request = response.request();
// Get CDP client for low-level access
const client = await page.target().createCDPSession();
// Enable Security domain
await client.send('Security.enable');
// Get security details from the loaded page
const securityState = await client.send('Security.getSecurityState');
sslInfo = {
url,
securityState: securityState.securityState,
schemeIsCryptographic: securityState.schemeIsCryptographic,
summary: securityState.summary || '',
};
// Try to get detailed certificate info if available
if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) {
sslInfo.issues = securityState.securityStateIssueIds;
}
// Get response security details from navigation
let mainResponse = null;
page.on('response', async (response) => {
if (response.url() === url || response.request().isNavigationRequest()) {
mainResponse = response;
// Only capture the main navigation request
if (!request.isNavigationRequest() || request.frame() !== page.mainFrame()) {
return;
}
});
// If we have security details from response
if (mainResponse) {
try {
const securityDetails = await mainResponse.securityDetails();
if (securityDetails) {
sslInfo.protocol = securityDetails.protocol();
sslInfo.subjectName = securityDetails.subjectName();
sslInfo.issuer = securityDetails.issuer();
sslInfo.validFrom = securityDetails.validFrom();
sslInfo.validTo = securityDetails.validTo();
sslInfo.certificateId = securityDetails.subjectName();
// Only capture if it's for our target URL
if (!response.url().startsWith(url.split('?')[0])) {
return;
}
const sanList = securityDetails.sanList();
if (sanList && sanList.length > 0) {
sslInfo.subjectAlternativeNames = sanList;
}
// Get security details from the response
const securityDetails = response.securityDetails();
let sslInfo = {};
if (securityDetails) {
sslInfo.protocol = securityDetails.protocol();
sslInfo.subjectName = securityDetails.subjectName();
sslInfo.issuer = securityDetails.issuer();
sslInfo.validFrom = securityDetails.validFrom();
sslInfo.validTo = securityDetails.validTo();
sslInfo.certificateId = securityDetails.subjectName();
sslInfo.securityState = 'secure';
sslInfo.schemeIsCryptographic = true;
const sanList = securityDetails.sanList();
if (sanList && sanList.length > 0) {
sslInfo.subjectAlternativeNames = sanList;
}
} catch (e) {
// Security details not available
} else if (response.url().startsWith('https://')) {
// HTTPS URL but no security details means something went wrong
sslInfo.securityState = 'unknown';
sslInfo.schemeIsCryptographic = true;
sslInfo.error = 'No security details available';
} else {
// Non-HTTPS URL
sslInfo.securityState = 'insecure';
sslInfo.schemeIsCryptographic = false;
}
// Write output directly to file
fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
} catch (e) {
// Ignore responses whose security details couldn't be read
}
});
await client.detach();
// Write output
fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
return { success: true, output: outputPath, sslInfo };
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
} finally {
if (browser) {
browser.disconnect();
}
}
// Don't disconnect - keep browser connection alive
return { browser, page };
}
async function main() {
@@ -163,75 +158,83 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__23_ssl.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
if (!getEnvBool('SAVE_SSL', true)) {
console.log('Skipping (SAVE_SSL=False)');
const result = {
extractor: EXTRACTOR_NAME,
status: 'skipped',
url,
snapshot_id: snapshotId,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(0);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if enabled
if (!getEnvBool('SAVE_SSL', true)) {
console.log('Skipping SSL (SAVE_SSL=False)');
status = 'skipped';
const endTs = new Date();
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`STATUS=${status}`);
console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
process.exit(0);
}
// Set up listener
await setupListener(url);
const result = await extractSsl(url);
// Write PID file so chrome_cleanup can kill us
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
if (result.success) {
status = 'succeeded';
output = result.output;
const protocol = result.sslInfo?.protocol || 'unknown';
console.log(`SSL details extracted: ${protocol}`);
} else {
status = 'failed';
error = result.error;
// Report success immediately (we're staying alive in background)
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
console.log(`OUTPUT=${OUTPUT_FILE}`);
console.log(`STATUS=succeeded`);
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'succeeded',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output: OUTPUT_FILE,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
// Daemonize: detach from parent and keep running
// This process will be killed by chrome_cleanup
if (process.stdin.isTTY) {
process.stdin.pause();
}
process.stdin.unref();
process.stdout.end();
process.stderr.end();
// Keep the process alive indefinitely
// Will be killed by chrome_cleanup via the PID file
setInterval(() => {}, 1000);
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
// Print results
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`DURATION=${duration.toFixed(2)}`);
if (output) {
console.log(`OUTPUT=${output}`);
}
console.log(`STATUS=${status}`);
if (error) {
const error = `${e.name}: ${e.message}`;
console.error(`ERROR=${error}`);
const endTs = new Date();
const result = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status: 'failed',
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
error,
};
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
process.exit(1);
}
// Print JSON result
const resultJson = {
extractor: EXTRACTOR_NAME,
url,
snapshot_id: snapshotId,
status,
start_ts: startTs.toISOString(),
end_ts: endTs.toISOString(),
duration: Math.round(duration * 100) / 100,
output,
error: error || null,
};
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -7,8 +7,8 @@
*
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
*
* Priority: 03 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
* Priority: 03 (early) - Must install before the Chrome session starts at the Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Blocks ads, trackers, and malware domains

View File

@@ -751,13 +751,15 @@
}
.messagelist li.warning {
background: #fffbeb;
background: #fffbeb !important;
background-image: none !important;
border: 1px solid #fde68a;
color: #92400e;
}
.messagelist li.error {
background: #fef2f2;
background: #fef2f2 !important;
background-image: none !important;
border: 1px solid #fecaca;
color: #991b1b;
}
@@ -916,11 +918,13 @@
gap: 12px;
}
#toolbar form {
#toolbar form,
#changelist-search {
display: flex;
align-items: center;
gap: 8px;
flex: 1;
flex: 0 1 auto;
max-width: 500px;
}
#searchbar {

View File

@@ -162,10 +162,15 @@
padding: 10px 14px;
background: rgba(0,0,0,0.2);
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor .crawl-header:hover {
background: rgba(88, 166, 255, 0.1);
}
#progress-monitor a.crawl-header:visited {
color: inherit;
}
#progress-monitor .crawl-icon {
font-size: 16px;
width: 20px;
@@ -252,10 +257,15 @@
gap: 10px;
padding: 8px 12px;
cursor: pointer;
text-decoration: none;
color: inherit;
}
#progress-monitor .snapshot-header:hover {
background: rgba(88, 166, 255, 0.05);
}
#progress-monitor a.snapshot-header:visited {
color: inherit;
}
#progress-monitor .snapshot-icon {
font-size: 14px;
width: 18px;
@@ -391,15 +401,6 @@
color: #f85149;
}
/* Expand/Collapse Icons */
#progress-monitor .expand-icon {
color: #8b949e;
font-size: 10px;
transition: transform 0.2s;
}
#progress-monitor .expand-icon.expanded {
transform: rotate(90deg);
}
</style>
<div id="progress-monitor">
@@ -449,8 +450,6 @@
let pollInterval = null;
let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
@@ -496,9 +495,8 @@
}
function renderSnapshot(snapshot, crawlId) {
const snapshotKey = `${crawlId}-${snapshot.id}`;
const isExpanded = expandedSnapshots.has(snapshotKey);
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
let extractorHtml = '';
if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
@@ -507,16 +505,15 @@
a.extractor.localeCompare(b.extractor)
);
extractorHtml = `
<div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
<div class="extractor-list">
${sortedExtractors.map(e => renderExtractor(e)).join('')}
</div>
`;
}
return `
<div class="snapshot-item" data-snapshot-key="${snapshotKey}">
<div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.all_extractors?.length ? '&#9654;' : ''}</span>
<div class="snapshot-item">
<a class="snapshot-header" href="${adminUrl}">
<span class="snapshot-icon">${statusIcon}</span>
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
@@ -526,7 +523,7 @@
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
</div>
</a>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
@@ -539,8 +536,8 @@
}
function renderCrawl(crawl) {
const isExpanded = expandedCrawls.has(crawl.id);
const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`;
let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -583,8 +580,7 @@
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${crawl.active_snapshots?.length ? '&#9654;' : ''}</span>
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
@@ -596,7 +592,7 @@
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
</div>
</a>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
@@ -604,7 +600,7 @@
</div>
</div>
${warningHtml}
<div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
<div class="crawl-body">
<div class="snapshot-list">
${snapshotsHtml}
</div>
@@ -613,41 +609,6 @@
`;
}
window.toggleCrawl = function(crawlId) {
const item = document.querySelector(`[data-crawl-id="${crawlId}"]`);
const body = item.querySelector('.crawl-body');
const icon = item.querySelector('.expand-icon');
if (expandedCrawls.has(crawlId)) {
expandedCrawls.delete(crawlId);
body.style.display = 'none';
icon.classList.remove('expanded');
} else {
expandedCrawls.add(crawlId);
body.style.display = '';
icon.classList.add('expanded');
}
localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls]));
};
window.toggleSnapshot = function(snapshotKey) {
const item = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`);
const extractorList = item.querySelector('.extractor-list');
const icon = item.querySelector('.expand-icon');
if (!extractorList) return;
if (expandedSnapshots.has(snapshotKey)) {
expandedSnapshots.delete(snapshotKey);
extractorList.style.display = 'none';
icon.classList.remove('expanded');
} else {
expandedSnapshots.add(snapshotKey);
extractorList.style.display = '';
icon.classList.add('expanded');
}
localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots]));
};
function updateProgress(data) {
// Calculate if there's activity

View File

@@ -66,9 +66,9 @@ class Orchestrator:
"""
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker]
# Configuration
POLL_INTERVAL: float = 1.0
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
MAX_WORKERS_PER_TYPE: int = 4 # Max workers per model type
MAX_TOTAL_WORKERS: int = 12 # Max workers across all types

View File

@@ -3,6 +3,7 @@ __package__ = 'archivebox.workers'
import sys
import time
import signal
import socket
import psutil
import shutil
import subprocess
@@ -47,6 +48,16 @@ SERVER_WORKER = lambda host, port: {
"redirect_stderr": "true",
}
def is_port_in_use(host: str, port: int) -> bool:
"""Check if a port is already in use."""
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind((host, port))
return False
except OSError:
return True
@cache
def get_sock_file():
"""Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits"""
@@ -161,9 +172,10 @@ def stop_existing_supervisord_process():
except subprocess.TimeoutExpired:
_supervisord_proc.kill()
_supervisord_proc.wait(timeout=2)
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
except (BrokenPipeError, IOError):
pass
_supervisord_proc = None
finally:
_supervisord_proc = None
return
# Fallback: if pid file exists, load PID int and kill that process
@@ -194,7 +206,7 @@ def stop_existing_supervisord_process():
pass
except psutil.NoSuchProcess:
pass
except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
except (BrokenPipeError, IOError):
pass
finally:
try:
@@ -423,15 +435,8 @@ def tail_multiple_worker_logs(log_files: list[str], follow=True, proc=None):
for log_path in log_paths:
try:
f = open(log_path, 'r')
# Don't seek to end - show recent content so user sees something
# Go to end minus 4KB to show some recent logs
f.seek(0, 2) # Go to end first
file_size = f.tell()
if file_size > 4096:
f.seek(file_size - 4096)
f.readline() # Skip partial line
else:
f.seek(0) # Small file, read from start
# Seek to end - only show NEW logs from now on, not old logs
f.seek(0, 2) # Go to end
file_handles.append((log_path, f))
print(f" [tailing {log_path.name}]")
@@ -536,7 +541,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
finally:
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(0.5)
time.sleep(1.0) # Give processes time to fully terminate
def start_cli_workers(watch=False):
@@ -563,7 +568,7 @@ def start_cli_workers(watch=False):
finally:
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(0.5)
time.sleep(1.0) # Give processes time to fully terminate
return [ORCHESTRATOR_WORKER]

View File

@@ -67,8 +67,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 1.0
IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval)
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
self.worker_id = worker_id
@@ -214,28 +214,33 @@ class Worker:
self.idle_count = 0
# Build metadata for task start
start_metadata = {'task_id': str(obj.pk)}
start_metadata = {}
url = None
if hasattr(obj, 'url'):
# SnapshotWorker
url = str(obj.url) if obj.url else None
else:
url = None
elif hasattr(obj, 'snapshot') and hasattr(obj.snapshot, 'url'):
# ArchiveResultWorker
url = str(obj.snapshot.url) if obj.snapshot.url else None
elif hasattr(obj, 'get_urls_list'):
# CrawlWorker
urls = obj.get_urls_list()
url = urls[0] if urls else None
extractor = None
if hasattr(obj, 'extractor'):
# ArchiveResultWorker
# ArchiveResultWorker, Crawl
extractor = obj.extractor
start_metadata['extractor'] = extractor
log_worker_event(
worker_type=worker_type_name,
event='Processing...',
event='Starting...',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
url=url,
extractor=extractor,
metadata=start_metadata,
metadata=start_metadata if start_metadata else None,
)
start_time = time.time()
@@ -244,7 +249,6 @@ class Worker:
# Build metadata for task completion
complete_metadata = {
'task_id': str(obj.pk),
'duration': elapsed,
'status': 'success' if success else 'failed',
}

View File

@@ -1,101 +0,0 @@
#!/bin/bash
set -e
echo "=========================================="
echo "Testing Chrome Extension System"
echo "=========================================="
# Get absolute path to project root
PROJECT_ROOT="$(cd "$(dirname "$0")" && pwd)"
# Set up test environment with absolute paths
export DATA_DIR="$PROJECT_ROOT/data"
export ACTIVE_PERSONA="Test"
export CHROME_EXTENSIONS_DIR="$PROJECT_ROOT/data/personas/Test/chrome_extensions"
export API_KEY_2CAPTCHA="test_api_key_12345"
# Clean up any previous test data
echo ""
echo "[1/6] Cleaning up previous test data..."
rm -rf "$CHROME_EXTENSIONS_DIR"
rm -rf "$PROJECT_ROOT/chrome_session"
# Also clean up any files created in plugin directories from previous runs
find "$PROJECT_ROOT/archivebox/plugins" -type d -name "data" -exec rm -rf {} + 2>/dev/null || true
mkdir -p "$CHROME_EXTENSIONS_DIR"
echo "✓ Clean slate ready"
# Test 1: Install captcha2 extension
echo ""
echo "[2/6] Testing captcha2 extension installation..."
node "$PROJECT_ROOT/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js"
if [ -f "$CHROME_EXTENSIONS_DIR/captcha2.extension.json" ]; then
echo "✓ captcha2.extension.json created"
else
echo "✗ Failed to create captcha2.extension.json"
exit 1
fi
# Test 2: Check caching (run again, should skip)
echo ""
echo "[3/6] Testing cache (should skip re-installation)..."
node "$PROJECT_ROOT/archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js"
echo "✓ Cache check passed"
# Test 3: Install other extensions
echo ""
echo "[4/6] Testing other extensions..."
node "$PROJECT_ROOT/archivebox/plugins/istilldontcareaboutcookies/on_Snapshot__02_istilldontcareaboutcookies.js"
node "$PROJECT_ROOT/archivebox/plugins/ublock/on_Snapshot__03_ublock.js"
node "$PROJECT_ROOT/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js"
echo "✓ All extensions installed"
# Test 4: List installed extensions
echo ""
echo "[5/6] Verifying extension files..."
ls -lh "$CHROME_EXTENSIONS_DIR"/*.extension.json 2>/dev/null || echo "No extension.json files found"
# Count extensions
EXT_COUNT=$(ls -1 "$CHROME_EXTENSIONS_DIR"/*.extension.json 2>/dev/null | wc -l | tr -d ' ')
echo ""
echo "Found $EXT_COUNT extension metadata files"
if [ "$EXT_COUNT" -ge "3" ]; then
echo "✓ Expected extensions installed"
else
echo "✗ Expected at least 3 extensions, found $EXT_COUNT"
exit 1
fi
# Test 5: Check unpacked directories
echo ""
echo "[6/6] Checking unpacked extension directories..."
UNPACKED_COUNT=$(find "$CHROME_EXTENSIONS_DIR" -type d -name "*__*" 2>/dev/null | wc -l | tr -d ' ')
echo "Found $UNPACKED_COUNT unpacked extension directories"
if [ "$UNPACKED_COUNT" -ge "3" ]; then
echo "✓ Extensions unpacked successfully"
else
echo "✗ Expected at least 3 unpacked directories, found $UNPACKED_COUNT"
exit 1
fi
# Summary
echo ""
echo "=========================================="
echo "✓ All tests passed!"
echo "=========================================="
echo ""
echo "Installed extensions:"
for json_file in "$CHROME_EXTENSIONS_DIR"/*.extension.json; do
if [ -f "$json_file" ]; then
NAME=$(node -e "console.log(require('$json_file').name)")
VERSION=$(node -e "console.log(require('$json_file').version || 'unknown')")
echo " - $NAME (v$VERSION)"
fi
done
echo ""
echo "To clean up test data:"
echo " rm -rf ./data/personas/Test"