From 4ccb0863bbd2ed7928991cf53bade2e719193f46 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 28 Dec 2025 05:29:24 -0800 Subject: [PATCH] continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script --- CLAUDE.md | 5 + TODO_hook_concurrency.md | 136 ++++++---- archivebox/core/admin_archiveresults.py | 4 +- archivebox/core/admin_snapshots.py | 6 +- archivebox/core/models.py | 4 +- archivebox/core/statemachines.py | 12 +- archivebox/core/views.py | 42 +-- archivebox/crawls/models.py | 2 +- archivebox/misc/logging_util.py | 16 +- archivebox/misc/process_utils.py | 248 +++--------------- archivebox/misc/shell_welcome_message.py | 2 +- .../on_Snapshot__39_accessibility.js | 14 +- .../on_Snapshot__13_archive_org.py | 2 +- .../chrome/on_Crawl__20_chrome_launch.bg.js | 108 +------- .../chrome/on_Snapshot__20_chrome_tab.bg.js | 2 +- .../chrome/on_Snapshot__30_chrome_navigate.js | 2 +- .../on_Snapshot__21_consolelog.bg.js | 2 +- archivebox/plugins/dom/on_Snapshot__36_dom.js | 14 +- .../favicon/on_Snapshot__11_favicon.py | 2 +- .../forumdl/on_Snapshot__53_forumdl.py | 2 +- .../gallerydl/on_Snapshot__52_gallerydl.py | 2 +- archivebox/plugins/git/on_Snapshot__12_git.py | 2 +- .../headers/on_Snapshot__33_headers.js | 2 +- .../htmltotext/on_Snapshot__54_htmltotext.py | 2 +- .../plugins/media/on_Snapshot__51_media.py | 2 +- .../mercury/on_Snapshot__53_mercury.py | 2 +- .../papersdl/on_Snapshot__54_papersdl.py | 2 +- .../on_Snapshot__40_parse_dom_outlinks.js | 16 +- .../on_Snapshot__60_parse_html_urls.py | 4 +- .../tests/test_parse_html_urls.py | 2 +- .../on_Snapshot__64_parse_jsonl_urls.py | 4 +- .../tests/test_parse_jsonl_urls.py | 2 +- .../on_Snapshot__63_parse_netscape_urls.py | 4 +- .../on_Snapshot__61_parse_rss_urls.py | 4 +- .../test_parse_rss_urls_comprehensive.py | 2 +- .../on_Snapshot__62_parse_txt_urls.py | 4 +- .../tests/test_parse_txt_urls.py | 2 +- archivebox/plugins/pdf/on_Snapshot__35_pdf.js | 14 +- .../{extractor_utils.py => plugin_utils.py} | 12 +- .../on_Snapshot__52_readability.py | 2 +- .../redirects/on_Snapshot__31_redirects.bg.js | 4 +- .../responses/on_Snapshot__24_responses.bg.js | 2 +- .../screenshot/on_Snapshot__34_screenshot.js | 14 +- .../on_Snapshot__91_index_sonic.py | 10 +- .../on_Snapshot__90_index_sqlite.py | 10 +- archivebox/plugins/seo/on_Snapshot__38_seo.js | 14 +- .../singlefile/on_Snapshot__37_singlefile.py | 2 +- .../plugins/ssl/on_Snapshot__23_ssl.bg.js | 2 +- .../on_Snapshot__31_staticfile.bg.js | 12 +- .../plugins/title/on_Snapshot__32_title.js | 2 +- .../plugins/wget/on_Snapshot__50_wget.py | 2 +- bin/kill_chrome.sh | 156 +++++++++++ tests/test_recursive_crawl.py | 8 +- 53 files changed, 456 insertions(+), 493 deletions(-) rename archivebox/plugins/{extractor_utils.py => plugin_utils.py} (97%) create mode 100755 bin/kill_chrome.sh diff --git a/CLAUDE.md b/CLAUDE.md index 5e6040b0..ae17cc52 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -212,3 +212,8 @@ sqlite3 /path/to/index.sqlite3 "PRAGMA table_info(core_snapshot);" ```bash sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -xvs 2>&1 | head -200' ``` + +### Kill Zombie Chrome Processes +```bash +./bin/kill_chrome.sh +``` diff --git a/TODO_hook_concurrency.md b/TODO_hook_concurrency.md index 82190e7f..f8f1bcf7 100644 --- a/TODO_hook_concurrency.md +++ b/TODO_hook_concurrency.md @@ -6,6 +6,48 @@ Snapshot.run() should enforce that snapshot hooks are run in **10 discrete, sequ For every discovered hook 
script, ArchiveBox should create an ArchiveResult in `queued` state, then manage running them using `retry_at` and inline logic to enforce this ordering. +## Design Decisions + +### ArchiveResult Schema +- Add `ArchiveResult.hook_name` (CharField, nullable) - just filename, e.g., `'on_Snapshot__20_chrome_tab.bg.js'` +- Keep `ArchiveResult.plugin` - still important (plugin directory name) +- Step number derived on-the-fly from `hook_name` via `extract_step(hook_name)` - not stored + +### Snapshot Schema +- Add `Snapshot.current_step` (IntegerField 0-9, default=0) +- Integrate with `SnapshotMachine` state transitions for step advancement + +### Hook Discovery & Execution +- `Snapshot.run()` discovers all hooks upfront, creates one AR per hook with `hook_name` set +- All ARs for a given step can be claimed and executed in parallel by workers +- Workers claim ARs where `extract_step(ar.hook_name) <= snapshot.current_step` +- `Snapshot.advance_step_if_ready()` increments `current_step` when: + - All **foreground** hooks in current step are finished (SUCCEEDED/FAILED/SKIPPED) + - Background hooks don't block advancement (they continue running) + - Called from `SnapshotMachine` state transitions + +### ArchiveResult.run() Behavior +- If `self.hook_name` is set: run that single hook +- If `self.hook_name` is None: discover all hooks for `self.plugin` and run sequentially +- Background hooks detected by `.bg.` in filename (e.g., `on_Snapshot__20_chrome_tab.bg.js`) +- Background hooks return immediately (ArchiveResult stays in STARTED state) +- Foreground hooks wait for completion, update status from JSONL output + +### Hook Execution Flow +1. **Within a step**: Workers claim all ARs for current step in parallel +2. **Foreground hooks** (no .bg): ArchiveResult waits for completion, transitions to SUCCEEDED/FAILED/SKIPPED +3. **Background hooks** (.bg): ArchiveResult transitions to STARTED, hook continues running +4. **Step advancement**: `Snapshot.advance_step_if_ready()` checks: + - Are all foreground ARs in current step finished? (SUCCEEDED/FAILED/SKIPPED) + - Ignore ARs still in STARTED (background hooks) + - If yes, increment `current_step` +5. **Snapshot sealing**: When `current_step=9` and all foreground hooks done, kill background hooks via `Snapshot.cleanup()` + +### Unnumbered Hooks +- Extract step via `re.search(r'__(\d{2})_', hook_name)`, default to 9 if no match +- Log warning for unnumbered hooks +- Purely runtime derivation - no stored field + ## Hook Numbering Convention Hooks scripts are numbered `00` to `99` to control: @@ -31,20 +73,19 @@ on_Snapshot__53_media.bg.py ## Background (.bg) vs Foreground Scripts ### Foreground Scripts (no .bg suffix) -- Run sequentially within their step -- Block step progression until they exit -- Should exit naturally when work is complete +- Launch in parallel with other hooks in their step +- Step waits for all foreground hooks to complete or timeout - Get killed with SIGTERM if they exceed their `PLUGINNAME_TIMEOUT` +- Step advances when all foreground hooks finish ### Background Scripts (.bg suffix) -- Spawned and allowed to continue running -- Do NOT block step progression -- Run until **their own `PLUGINNAME_TIMEOUT` is reached** (not until step 99) -- Get polite SIGTERM when timeout expires, then SIGKILL 60s later if not exited -- Must implement their own concurrency control using filesystem (semaphore files, locks, etc.) 
+- Launch in parallel with other hooks in their step +- Do NOT block step progression - step can advance while they run +- Continue running across step boundaries until complete or timeout +- Get killed with SIGTERM when Snapshot transitions to SEALED (via `Snapshot.cleanup()`) - Should exit naturally when work is complete (best case) -**Important:** If a .bg script starts at step 05 with `MEDIA_TIMEOUT=3600s`, it gets the full 3600s regardless of when step 99 completes. It runs on its own timeline. +**Important:** A .bg script started in step 2 can keep running through steps 3, 4, 5... until the Snapshot seals or the hook exits naturally. ## Execution Step Guidelines @@ -268,54 +309,47 @@ archivebox/plugins/{plugin_name}/ ## Implementation Checklist -### Phase 1: Renumber Existing Hooks ✅ -- [ ] Renumber DOM extractors to 50-59 range -- [ ] Ensure pdf/screenshot are NOT .bg (need sequential access) -- [ ] Ensure media (ytdlp) IS .bg (can run for hours) -- [ ] Add step comments to each plugin for clarity +### Phase 1: Schema Migration ✅ +- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0) +- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename +- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py` -### Phase 2: Timeout Consistency ✅ -- [x] All plugins support `PLUGINNAME_TIMEOUT` env var -- [x] All plugins fall back to generic `TIMEOUT` env var -- [x] Background scripts handle SIGTERM gracefully (or exit naturally) +### Phase 2: Core Logic Updates +- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` + - Extract first digit from `__XX_` pattern + - Default to 9 for unnumbered hooks +- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: + - Discover all hooks (not plugins) + - Create one AR per hook with `hook_name` set +- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`: + - If `hook_name` set: run single hook + - If `hook_name` None: discover all plugin hooks (existing behavior) +- [ ] Add `Snapshot.advance_step_if_ready()` method: + - Check if all foreground ARs in current step finished + - Increment `current_step` if ready + - Ignore background hooks (.bg) in completion check +- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: + - Call `advance_step_if_ready()` before checking if done -### Phase 3: Refactor Snapshot.run() -- [ ] Parse hook filenames to extract step number (first digit) -- [ ] Group hooks by step (0-9) -- [ ] Run each step sequentially -- [ ] Within each step: - - [ ] Launch foreground hooks sequentially - - [ ] Launch .bg hooks and track PIDs - - [ ] Wait for foreground hooks to complete before next step -- [ ] Track .bg script timeouts independently -- [ ] Send SIGTERM to .bg scripts when their timeout expires -- [ ] Send SIGKILL 60s after SIGTERM if not exited +### Phase 3: Worker Coordination +- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`: + - Filter: `extract_step(ar.hook_name) <= snapshot.current_step` + - Note: May need to denormalize or use clever query since step is derived + - Alternative: Claim any AR in QUEUED state, check step in Python before processing -### Phase 4: ArchiveResult Management -- [ ] Create one ArchiveResult per hook (not per plugin) -- [ ] Set initial state to `queued` -- [ ] Update state based on JSONL output and exit code -- [ ] Set `retry_at` for hooks that exit non-zero with no JSONL -- [ ] Don't retry hooks that emit `{"status": "failed"}` - -### 
Phase 5: JSONL Streaming -- [ ] Parse stdout JSONL line-by-line during hook execution -- [ ] Create/update DB rows as JSONL is emitted (streaming mode) -- [ ] Handle partial JSONL on hook crash - -### Phase 6: Zombie Process Management -- [ ] Read `.pid` files from hook output directories -- [ ] Sweep zombies on cleanup -- [ ] Handle double-forked processes correctly +### Phase 4: Hook Renumbering +- [ ] Renumber hooks per renumbering map below +- [ ] Add `.bg` suffix to long-running hooks +- [ ] Test all hooks still work after renumbering ## Migration Path -### Backward Compatibility -During migration, support both old and new numbering: -1. Run hooks numbered 00-99 in step order -2. Run unnumbered hooks last (step 9) for compatibility -3. Log warnings for unnumbered hooks -4. Eventually require all hooks to be numbered +### Natural Compatibility +No special migration needed: +1. Existing ARs with `hook_name=None` continue to work (discover all plugin hooks at runtime) +2. New ARs get `hook_name` set (single hook per AR) +3. `ArchiveResult.run()` handles both cases naturally +4. Unnumbered hooks default to step 9 (log warning) ### Renumbering Map diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 1acaf27a..e640e3e5 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -252,9 +252,9 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): - list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str') + list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface') search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp') autocomplete_fields = ['snapshot'] diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index 4f0217a3..ce89527e 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -46,9 +46,9 @@ class SnapshotActionForm(ActionForm): ), ) - # TODO: allow selecting actions for specific extractors? is this useful? - # extractor = forms.ChoiceField( - # choices=ArchiveResult.EXTRACTOR_CHOICES, + # TODO: allow selecting actions for specific extractor plugins? is this useful? 
+ # plugin = forms.ChoiceField( + # choices=ArchiveResult.PLUGIN_CHOICES, # required=False, # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) # ) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 673c85a9..fbef95cd 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1041,7 +1041,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str() def icons(self) -> str: - """Generate HTML icons showing which extractors have succeeded for this snapshot""" + """Generate HTML icons showing which extractor plugins have succeeded for this snapshot""" from django.utils.html import format_html, mark_safe cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}' @@ -1475,7 +1475,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea priority = 50 elif 'index' in name_lower: priority = 100 - elif name_lower.startswith(('output', 'content', extractor_name)): + elif name_lower.startswith(('output', 'content', plugin_name)): priority = 50 elif ext in ('html', 'htm', 'pdf'): priority = 30 diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index 89dda0c8..cec2b64f 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True): retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying ) - # Run the snapshot - creates pending archiveresults for all enabled extractors + # Run the snapshot - creates pending archiveresults for all enabled plugins self.snapshot.run() # unlock the snapshot after we're done + set status = started @@ -179,15 +179,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True): return can_start def is_succeeded(self) -> bool: - """Check if extraction succeeded (status was set by run_extractor()).""" + """Check if extractor plugin succeeded (status was set by run()).""" return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED - + def is_failed(self) -> bool: - """Check if extraction failed (status was set by run_extractor()).""" + """Check if extractor plugin failed (status was set by run()).""" return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED - + def is_skipped(self) -> bool: - """Check if extraction was skipped (status was set by run_extractor()).""" + """Check if extractor plugin was skipped (status was set by run()).""" return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED def is_backoff(self) -> bool: diff --git a/archivebox/core/views.py b/archivebox/core/views.py index df17924a..37a885b2 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -96,8 +96,8 @@ class SnapshotView(View): if not key.endswith('_path') or not path or path.startswith('http'): continue - extractor_name = key.replace('_path', '') - if extractor_name in archiveresults: + plugin_name = key.replace('_path', '') + if plugin_name in archiveresults: continue # Already have this from ArchiveResult file_path = snap_dir / path @@ -107,8 +107,8 @@ class SnapshotView(View): try: file_size = file_path.stat().st_size if file_size >= 15_000: # Only show files > 15KB - archiveresults[extractor_name] = { - 'name': extractor_name, + archiveresults[plugin_name] = { + 'name': plugin_name, 'path': path, 'ts': 
ts_to_date_str(file_path.stat().st_mtime or 0), 'size': file_size, @@ -117,7 +117,7 @@ class SnapshotView(View): except OSError: continue - # Get available extractors from hooks (sorted by numeric prefix for ordering) + # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) # Convert to base names for display ordering all_plugins = [get_extractor_name(e) for e in get_extractors()] preferred_types = tuple(all_plugins) @@ -437,7 +437,7 @@ class AddView(UserPassesTestMixin, FormView): parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser tag = form.cleaned_data["tag"] depth = 0 if form.cleaned_data["depth"] == "0" else 1 - extractors = ','.join(form.cleaned_data["archive_methods"]) + plugins = ','.join(form.cleaned_data["archive_methods"]) input_kwargs = { "urls": urls, "tag": tag, @@ -447,8 +447,8 @@ class AddView(UserPassesTestMixin, FormView): "out_dir": DATA_DIR, "created_by_id": self.request.user.pk, } - if extractors: - input_kwargs.update({"extractors": extractors}) + if plugins: + input_kwargs.update({"plugins": plugins}) from archivebox.config.permissions import HOSTNAME @@ -472,7 +472,7 @@ class AddView(UserPassesTestMixin, FormView): # 'INDEX_ONLY': index_only, # 'OVERWRITE': False, 'DEPTH': depth, - 'EXTRACTORS': extractors or '', + 'PLUGINS': plugins or '', # 'DEFAULT_PERSONA': persona or 'Default', } ) @@ -580,17 +580,17 @@ def live_progress_view(request): snapshot_results = snapshot.archiveresult_set.all() # Count in memory instead of DB queries - total_extractors = len(snapshot_results) - completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED) - failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED) - pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED) + total_plugins = len(snapshot_results) + completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED) + failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED) + pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED) # Calculate snapshot progress - snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0 + snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0 - # Get all extractors for this snapshot (already prefetched, sort in Python) + # Get all extractor plugins for this snapshot (already prefetched, sort in Python) # Order: started first, then queued, then completed - def extractor_sort_key(ar): + def plugin_sort_key(ar): status_order = { ArchiveResult.StatusChoices.STARTED: 0, ArchiveResult.StatusChoices.QUEUED: 1, @@ -605,7 +605,7 @@ def live_progress_view(request): 'plugin': ar.plugin, 'status': ar.status, } - for ar in sorted(snapshot_results, key=extractor_sort_key) + for ar in sorted(snapshot_results, key=plugin_sort_key) ] active_snapshots_for_crawl.append({ @@ -614,10 +614,10 @@ def live_progress_view(request): 'status': snapshot.status, 'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None, 'progress': snapshot_progress, - 'total_extractors': total_extractors, - 'completed_extractors': completed_extractors, - 'failed_extractors': failed_extractors, - 'pending_extractors': pending_extractors, + 'total_plugins': 
total_plugins, + 'completed_plugins': completed_plugins, + 'failed_plugins': failed_plugins, + 'pending_plugins': pending_plugins, 'all_plugins': all_plugins, }) diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index b143f13f..f26ee5aa 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -196,7 +196,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Add a URL to the crawl queue if not already present. Args: - entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'via_extractor' + entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'plugin' Returns: True if URL was added, False if skipped (duplicate or depth exceeded) diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py index 36ab9c56..e1364eda 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -522,7 +522,7 @@ def log_worker_event( pid: Optional[int] = None, worker_id: Optional[str] = None, url: Optional[str] = None, - extractor: Optional[str] = None, + plugin: Optional[str] = None, metadata: Optional[Dict[str, Any]] = None, error: Optional[Exception] = None, ) -> None: @@ -534,9 +534,9 @@ def log_worker_event( event: Event name (Starting, Completed, Failed, etc.) indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker) pid: Process ID - worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker) + worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker) url: URL being processed (for SnapshotWorker/ArchiveResultWorker) - extractor: Extractor name (for ArchiveResultWorker) + plugin: Plugin name (for ArchiveResultWorker) metadata: Dict of metadata to show in curly braces error: Exception if event is an error """ @@ -544,7 +544,7 @@ def log_worker_event( from rich.markup import escape - # Build worker identifier (without URL/extractor) + # Build worker identifier (without URL/plugin) worker_parts = [worker_type] # Don't add pid/worker_id for DB operations (they happen in whatever process is running) if pid and worker_type != 'DB': @@ -556,12 +556,12 @@ def log_worker_event( worker_label_base = worker_parts[0] worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None - # Build URL/extractor display (shown AFTER the label, outside brackets) + # Build URL/plugin display (shown AFTER the label, outside brackets) url_extractor_parts = [] if url: url_extractor_parts.append(f'url: {escape(url)}') - if extractor: - url_extractor_parts.append(f'extractor: {escape(extractor)}') + if plugin: + url_extractor_parts.append(f'extractor: {escape(plugin)}') url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else '' @@ -623,7 +623,7 @@ def log_worker_event( text.append(f' {event}{error_str}', style=color) - # Add URL/extractor info first (more important) + # Add URL/plugin info first (more important) if url_extractor_str: text.append(f' | {url_extractor_str}') diff --git a/archivebox/misc/process_utils.py b/archivebox/misc/process_utils.py index 4856fc9d..9d3fe52d 100644 --- a/archivebox/misc/process_utils.py +++ b/archivebox/misc/process_utils.py @@ -1,14 +1,9 @@ """ -Cross-platform process validation utilities using psutil. +Process validation using psutil and filesystem mtime. -Uses filesystem mtime as a "password" to validate PIDs haven't been reused. 
-Since filesystem mtimes can be set arbitrarily, but process start times cannot, -we can detect PID reuse by comparing: - - PID file mtime (set to process start time when we launched it) - - Actual process start time (from psutil) - -If they match (within tolerance), it's our process. -If they don't match, the PID was reused by a different process. +Uses mtime as a "password": PID files are timestamped with process start time. +Since filesystem mtimes can be set arbitrarily but process start times cannot, +comparing them detects PID reuse. """ __package__ = 'archivebox.misc' @@ -20,245 +15,70 @@ from typing import Optional try: import psutil + PSUTIL_AVAILABLE = True except ImportError: - psutil = None + PSUTIL_AVAILABLE = False -def get_process_info(pid: int) -> Optional[dict]: - """ - Get process information using psutil. - - Args: - pid: Process ID - - Returns: - Dict with 'start_time', 'cmdline', 'name', 'status' or None if not found - """ - if psutil is None: - return None +def validate_pid_file(pid_file: Path, cmd_file: Optional[Path] = None, tolerance: float = 5.0) -> bool: + """Validate PID using mtime and optional cmd.sh. Returns True if process is ours.""" + if not PSUTIL_AVAILABLE or not pid_file.exists(): + return False try: + pid = int(pid_file.read_text().strip()) proc = psutil.Process(pid) - return { - 'start_time': proc.create_time(), # Unix epoch seconds - 'cmdline': proc.cmdline(), - 'name': proc.name(), - 'status': proc.status(), - } - except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): - return None + # Check mtime matches process start time + if abs(pid_file.stat().st_mtime - proc.create_time()) > tolerance: + return False # PID reused -def validate_pid_file( - pid_file: Path, - cmd_file: Optional[Path] = None, - tolerance_seconds: float = 5.0 -) -> bool: - """ - Validate PID file using mtime as "password". - - Returns True only if ALL checks pass: - 1. PID file exists and contains valid integer - 2. Process with that PID exists - 3. File mtime matches process start time (within tolerance) - 4. If cmd_file provided, process cmdline contains expected args - - Args: - pid_file: Path to .pid file - cmd_file: Optional path to cmd.sh for command validation - tolerance_seconds: Allowed difference between mtime and start time - - Returns: - True if PID is validated, False if reused/invalid - """ - if psutil is None: - # Fallback: just check if process exists (no validation) - return _validate_pid_file_without_psutil(pid_file) - - # Check PID file exists - if not pid_file.exists(): - return False - - # Read PID - try: - pid = int(pid_file.read_text().strip()) - except (ValueError, OSError): - return False - - # Get process info - proc_info = get_process_info(pid) - if proc_info is None: - return False # Process doesn't exist - - # Check mtime matches process start time - try: - file_mtime = pid_file.stat().st_mtime - except OSError: - return False - - proc_start_time = proc_info['start_time'] - time_diff = abs(file_mtime - proc_start_time) - - if time_diff > tolerance_seconds: - # PID was reused by different process - return False - - # Validate command if provided - if cmd_file and cmd_file.exists(): - try: - expected_cmd = cmd_file.read_text().strip() - actual_cmdline = ' '.join(proc_info['cmdline']) - - # Check for key indicators (chrome, debug port, etc.) 
- # This is a heuristic - just checks if critical args are present - if '--remote-debugging-port' in expected_cmd: - if '--remote-debugging-port' not in actual_cmdline: + # Validate command if provided + if cmd_file and cmd_file.exists(): + cmd = cmd_file.read_text() + cmdline = ' '.join(proc.cmdline()) + if '--remote-debugging-port' in cmd and '--remote-debugging-port' not in cmdline: + return False + if ('chrome' in cmd.lower() or 'chromium' in cmd.lower()): + if 'chrome' not in proc.name().lower() and 'chromium' not in proc.name().lower(): return False - if 'chrome' in expected_cmd.lower() or 'chromium' in expected_cmd.lower(): - proc_name_lower = proc_info['name'].lower() - if 'chrome' not in proc_name_lower and 'chromium' not in proc_name_lower: - return False - - except OSError: - pass # Can't validate command, but other checks passed - - return True - - -def _validate_pid_file_without_psutil(pid_file: Path) -> bool: - """ - Fallback validation when psutil not available. - Only checks if process exists, no validation. - """ - if not pid_file.exists(): - return False - - try: - pid = int(pid_file.read_text().strip()) - os.kill(pid, 0) # Signal 0 = check existence return True - except (OSError, ValueError, ProcessLookupError): + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess, ValueError, OSError): return False def write_pid_file_with_mtime(pid_file: Path, pid: int, start_time: float): - """ - Write PID file and set mtime to process start time. - - This creates a "password" that can be validated later to ensure - the PID hasn't been reused by a different process. - - Args: - pid_file: Path to .pid file to create - pid: Process ID to write - start_time: Process start time as Unix epoch seconds - """ + """Write PID file and set mtime to process start time.""" pid_file.write_text(str(pid)) - - # Set both atime and mtime to process start time try: os.utime(pid_file, (start_time, start_time)) except OSError: - # If we can't set mtime, file is still written - # Validation will be less reliable but won't break - pass + pass # mtime optional, validation degrades gracefully def write_cmd_file(cmd_file: Path, cmd: list[str]): - """ - Write command script for validation. - - Args: - cmd_file: Path to cmd.sh to create - cmd: Command list (e.g., ['chrome', '--remote-debugging-port=9222', ...]) - """ - # Shell escape arguments with spaces or special chars - def shell_escape(arg: str) -> str: - if ' ' in arg or '"' in arg or "'" in arg or '$' in arg: - # Escape double quotes and wrap in double quotes - return f'"{arg.replace(chr(34), chr(92) + chr(34))}"' - return arg - - escaped_cmd = [shell_escape(arg) for arg in cmd] - script = '#!/bin/bash\n' + ' '.join(escaped_cmd) + '\n' + """Write shell command script.""" + def escape(arg: str) -> str: + return f'"{arg.replace(chr(34), chr(92)+chr(34))}"' if any(c in arg for c in ' "$') else arg + script = '#!/bin/bash\n' + ' '.join(escape(arg) for arg in cmd) + '\n' cmd_file.write_text(script) try: cmd_file.chmod(0o755) except OSError: - pass # Best effort + pass -def safe_kill_process( - pid_file: Path, - cmd_file: Optional[Path] = None, - signal_num: int = 15, # SIGTERM - validate: bool = True -) -> bool: - """ - Safely kill a process with validation. 
- - Args: - pid_file: Path to .pid file - cmd_file: Optional path to cmd.sh for validation - signal_num: Signal to send (default SIGTERM=15) - validate: If True, validate process identity before killing - - Returns: - True if process was killed, False if not found or validation failed - """ - if not pid_file.exists(): +def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15) -> bool: + """Kill process after validation. Returns True if killed.""" + if not validate_pid_file(pid_file, cmd_file): + pid_file.unlink(missing_ok=True) # Clean stale file return False - # Validate process identity first - if validate: - if not validate_pid_file(pid_file, cmd_file): - # PID reused by different process, don't kill - # Clean up stale PID file - try: - pid_file.unlink() - except OSError: - pass - return False - - # Read PID and kill try: pid = int(pid_file.read_text().strip()) os.kill(pid, signal_num) return True except (OSError, ValueError, ProcessLookupError): return False - - -def cleanup_stale_pid_files(directory: Path, cmd_file_name: str = 'cmd.sh') -> int: - """ - Remove stale PID files from directory. - - A PID file is stale if: - - Process no longer exists, OR - - Process exists but validation fails (PID reused) - - Args: - directory: Directory to scan for *.pid files - cmd_file_name: Name of command file for validation (default: cmd.sh) - - Returns: - Number of stale PID files removed - """ - if not directory.exists(): - return 0 - - removed = 0 - for pid_file in directory.glob('**/*.pid'): - cmd_file = pid_file.parent / cmd_file_name - - # Check if valid - if not validate_pid_file(pid_file, cmd_file): - try: - pid_file.unlink() - removed += 1 - except OSError: - pass - - return removed diff --git a/archivebox/misc/shell_welcome_message.py b/archivebox/misc/shell_welcome_message.py index c67a647d..b99e5867 100644 --- a/archivebox/misc/shell_welcome_message.py +++ b/archivebox/misc/shell_welcome_message.py @@ -53,5 +53,5 @@ if __name__ == '__main__': prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? 
after anything to get help[/]') prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]') prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]') - prnt(' snap.archiveresult_set.all() [grey53]# see extractor results[/]') + prnt(' snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]') prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]') diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js index 9b610aa2..078cc3a4 100755 --- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js +++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js @@ -20,7 +20,7 @@ const path = require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'accessibility'; +const PLUGIN_NAME = 'accessibility'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'accessibility.json'; const CHROME_SESSION_DIR = '../chrome'; @@ -223,10 +223,14 @@ async function main() { process.exit(0); } - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Check if Chrome session exists, then wait for page load + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const result = await extractAccessibility(url); diff --git a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py index 24a0075f..79c41934 100644 --- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py +++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py @@ -25,7 +25,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'archive_org' +PLUGIN_NAME = 'archive_org' OUTPUT_DIR = '.' 
OUTPUT_FILE = 'archive.org.txt' diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js index 3ae9a039..781d8c5f 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js @@ -26,101 +26,23 @@ const { spawn } = require('child_process'); const http = require('http'); // Extractor metadata -const EXTRACTOR_NAME = 'chrome_launch'; +const PLUGIN_NAME = 'chrome_launch'; const OUTPUT_DIR = 'chrome'; -// Helper: Write PID file with mtime set to process start time +// Helpers for PID file creation function writePidWithMtime(filePath, pid, startTimeSeconds) { fs.writeFileSync(filePath, String(pid)); - // Set both atime and mtime to process start time for validation const startTimeMs = startTimeSeconds * 1000; fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs)); } -// Helper: Write command script for validation function writeCmdScript(filePath, binary, args) { - // Shell escape arguments containing spaces or special characters - const escapedArgs = args.map(arg => { - if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) { - return `"${arg.replace(/"/g, '\\"')}"`; - } - return arg; - }); - const script = `#!/bin/bash\n${binary} ${escapedArgs.join(' ')}\n`; - fs.writeFileSync(filePath, script); + const escape = arg => (arg.includes(' ') || arg.includes('"') || arg.includes('$')) + ? `"${arg.replace(/"/g, '\\"')}"` : arg; + fs.writeFileSync(filePath, `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n`); fs.chmodSync(filePath, 0o755); } -// Helper: Get process start time (cross-platform) -function getProcessStartTime(pid) { - try { - const { execSync } = require('child_process'); - if (process.platform === 'darwin') { - // macOS: ps -p PID -o lstart= gives start time - const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 }); - return Date.parse(output.trim()) / 1000; // Convert to epoch seconds - } else { - // Linux: read /proc/PID/stat field 22 (starttime in clock ticks) - const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8'); - const match = stat.match(/\) \w+ (\d+)/); - if (match) { - const startTicks = parseInt(match[1], 10); - // Convert clock ticks to seconds (assuming 100 ticks/sec) - const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]); - const bootTime = Date.now() / 1000 - uptimeSeconds; - return bootTime + (startTicks / 100); - } - } - } catch (e) { - // Can't get start time - return null; - } - return null; -} - -// Helper: Validate PID using mtime and command -function validatePid(pid, pidFile, cmdFile) { - try { - // Check process exists - try { - process.kill(pid, 0); // Signal 0 = check existence - } catch (e) { - return false; // Process doesn't exist - } - - // Check mtime matches process start time (within 5 sec tolerance) - const fileStat = fs.statSync(pidFile); - const fileMtime = fileStat.mtimeMs / 1000; // Convert to seconds - const procStartTime = getProcessStartTime(pid); - - if (procStartTime === null) { - // Can't validate - fall back to basic existence check - return true; - } - - if (Math.abs(fileMtime - procStartTime) > 5) { - // PID was reused by different process - return false; - } - - // Validate command if available - if (fs.existsSync(cmdFile)) { - const cmd = fs.readFileSync(cmdFile, 'utf8'); - // Check for Chrome/Chromium and debug port - if (!cmd.includes('chrome') && !cmd.includes('chromium')) { - return 
false; - } - if (!cmd.includes('--remote-debugging-port')) { - return false; - } - } - - return true; - } catch (e) { - return false; - } -} - // Global state for cleanup let chromePid = null; @@ -332,20 +254,20 @@ function killZombieChrome() { const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); if (isNaN(pid) || pid <= 0) continue; - // Validate PID before killing - const cmdFile = path.join(chromeDir, 'cmd.sh'); - if (!validatePid(pid, pidFile, cmdFile)) { - // PID reused or validation failed - console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`); + // Check if process exists (simple check, Python will validate properly) + try { + process.kill(pid, 0); + } catch (e) { + // Process dead, remove stale PID file try { fs.unlinkSync(pidFile); } catch (e) {} continue; } - // Process alive, validated, and crawl is stale - zombie! - console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`); + // Process alive and crawl is stale - zombie! + console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`); try { - // Kill process group first + // Kill process group try { process.kill(-pid, 'SIGKILL'); } catch (e) { @@ -354,14 +276,10 @@ function killZombieChrome() { killed++; console.error(`[+] Killed zombie (PID ${pid})`); - - // Remove PID file try { fs.unlinkSync(pidFile); } catch (e) {} - } catch (e) { console.error(`[!] Failed to kill PID ${pid}: ${e.message}`); } - } catch (e) { // Skip invalid PID files } diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js index b1ae8908..b2c222c7 100755 --- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js @@ -29,7 +29,7 @@ const http = require('http'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'chrome_tab'; +const PLUGIN_NAME = 'chrome_tab'; const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory const CHROME_SESSION_DIR = '.'; diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js index bca41606..400d5bec 100644 --- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js @@ -19,7 +19,7 @@ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'chrome_navigate'; +const PLUGIN_NAME = 'chrome_navigate'; const CHROME_SESSION_DIR = '.'; const OUTPUT_DIR = '.'; diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js index 27a7b702..8313ada0 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js @@ -14,7 +14,7 @@ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'consolelog'; +const PLUGIN_NAME = 'consolelog'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; const PID_FILE = 'hook.pid'; diff --git a/archivebox/plugins/dom/on_Snapshot__36_dom.js b/archivebox/plugins/dom/on_Snapshot__36_dom.js index aa2ce485..e5913681 100644 --- a/archivebox/plugins/dom/on_Snapshot__36_dom.js +++ b/archivebox/plugins/dom/on_Snapshot__36_dom.js @@ -23,7 +23,7 @@ const path = 
require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'dom'; +const PLUGIN_NAME = 'dom'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'output.html'; const CHROME_SESSION_DIR = '../chrome'; @@ -252,10 +252,14 @@ async function main() { })); process.exit(0); } else { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Only wait for page load if using shared Chrome session + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const result = await dumpDom(url); diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py index 7516929c..ea5e9200 100644 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py +++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py @@ -27,7 +27,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'favicon' +PLUGIN_NAME = 'favicon' OUTPUT_DIR = '.' OUTPUT_FILE = 'favicon.ico' diff --git a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py b/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py index 5b6d1963..00ee7c84 100755 --- a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py +++ b/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py @@ -31,7 +31,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'forumdl' +PLUGIN_NAME = 'forumdl' BIN_NAME = 'forum-dl' BIN_PROVIDERS = 'pip,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py b/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py index 8740a43c..c021ed12 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py @@ -32,7 +32,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'gallerydl' +PLUGIN_NAME = 'gallerydl' BIN_NAME = 'gallery-dl' BIN_PROVIDERS = 'pip,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__12_git.py index 2e476bdd..37f6e245 100644 --- a/archivebox/plugins/git/on_Snapshot__12_git.py +++ b/archivebox/plugins/git/on_Snapshot__12_git.py @@ -24,7 +24,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'git' +PLUGIN_NAME = 'git' BIN_NAME = 'git' BIN_PROVIDERS = 'apt,brew,env' OUTPUT_DIR = '.' 
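Note on the plan above: TODO_hook_concurrency.md only describes the step-derivation rule in prose (`re.search(r'__(\d{2})_', hook_name)`, first digit is the step, default to 9 with a warning for unnumbered hooks). The following is a minimal, non-authoritative sketch of the `extract_step()` helper that Phase 2 of that plan targets for `archivebox/hooks.py`; it is not part of this patch, and the exact signature is an assumption.

```python
# Sketch only, not part of this patch. Illustrates the extract_step() helper
# planned in TODO_hook_concurrency.md (Phase 2): step = first digit of the
# two-digit __XX_ prefix, defaulting to 9 with a warning for unnumbered hooks.
import re
import logging

logger = logging.getLogger(__name__)


def extract_step(hook_name: str) -> int:
    """Derive the 0-9 execution step from a hook filename like 'on_Snapshot__20_chrome_tab.bg.js'."""
    match = re.search(r'__(\d{2})_', hook_name)
    if not match:
        logger.warning('Hook %s has no __XX_ step prefix, defaulting to step 9', hook_name)
        return 9
    return int(match.group(1)[0])  # first digit of the two-digit prefix is the step


assert extract_step('on_Snapshot__20_chrome_tab.bg.js') == 2
assert extract_step('on_Snapshot__91_index_sonic.py') == 9
```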
diff --git a/archivebox/plugins/headers/on_Snapshot__33_headers.js b/archivebox/plugins/headers/on_Snapshot__33_headers.js index 5c2c9981..8613378a 100644 --- a/archivebox/plugins/headers/on_Snapshot__33_headers.js +++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js @@ -21,7 +21,7 @@ const https = require('https'); const http = require('http'); // Extractor metadata -const EXTRACTOR_NAME = 'headers'; +const PLUGIN_NAME = 'headers'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'headers.json'; const CHROME_SESSION_DIR = '../chrome'; diff --git a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py index c7c31b37..1c084091 100644 --- a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py +++ b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py @@ -26,7 +26,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'htmltotext' +PLUGIN_NAME = 'htmltotext' OUTPUT_DIR = '.' OUTPUT_FILE = 'htmltotext.txt' diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__51_media.py index 9e45dcb1..adf58aad 100644 --- a/archivebox/plugins/media/on_Snapshot__51_media.py +++ b/archivebox/plugins/media/on_Snapshot__51_media.py @@ -34,7 +34,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'media' +PLUGIN_NAME = 'media' BIN_NAME = 'yt-dlp' BIN_PROVIDERS = 'pip,apt,brew,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py index d8131d51..9da02088 100644 --- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py @@ -25,7 +25,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'mercury' +PLUGIN_NAME = 'mercury' BIN_NAME = 'postlight-parser' BIN_PROVIDERS = 'npm,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py index 6835f5fc..57521204 100755 --- a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py +++ b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py @@ -28,7 +28,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'papersdl' +PLUGIN_NAME = 'papersdl' BIN_NAME = 'papers-dl' BIN_PROVIDERS = 'pip,env' OUTPUT_DIR = '.' 
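Similarly, `Snapshot.advance_step_if_ready()` exists in this patch only as a checklist item in TODO_hook_concurrency.md. Below is a rough sketch of that logic under the planned schema (`Snapshot.current_step`, `ArchiveResult.hook_name`), written as a standalone function for illustration; the import paths and queryset shape are assumptions, not code from this diff, and `extract_step()` is the helper sketched above.

```python
# Sketch only, not part of this patch: Snapshot.advance_step_if_ready() as
# described in TODO_hook_concurrency.md, using the planned current_step and
# hook_name fields. Written as a plain function here for illustration.
from archivebox.core.models import ArchiveResult  # StatusChoices is referenced elsewhere in this patch
from archivebox.hooks import extract_step         # planned location per TODO Phase 2 (sketched above)

FINISHED_STATES = (
    ArchiveResult.StatusChoices.SUCCEEDED,
    ArchiveResult.StatusChoices.FAILED,
    ArchiveResult.StatusChoices.SKIPPED,
)


def advance_step_if_ready(snapshot) -> bool:
    """Bump snapshot.current_step once every foreground hook in the current
    step has finished; background (.bg.) hooks never block advancement."""
    for ar in snapshot.archiveresult_set.exclude(hook_name=None):
        if extract_step(ar.hook_name) != snapshot.current_step:
            continue  # hook belongs to an earlier or later step
        if '.bg.' in ar.hook_name:
            continue  # background hooks keep running and do not block the step
        if ar.status not in FINISHED_STATES:
            return False  # a foreground hook is still queued or started
    if snapshot.current_step < 9:
        snapshot.current_step += 1
        snapshot.save(update_fields=['current_step'])
        return True
    return False
```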
diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js index a0a2030b..0f98e38e 100755 --- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js +++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js @@ -23,7 +23,7 @@ const path = require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'parse_dom_outlinks'; +const PLUGIN_NAME = 'parse_dom_outlinks'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'outlinks.json'; const URLS_FILE = 'urls.jsonl'; // For crawl system @@ -190,7 +190,7 @@ async function extractOutlinks(url) { const urlsJsonl = crawlableUrls.map(href => JSON.stringify({ type: 'Snapshot', url: href, - via_extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, })).join('\n'); if (urlsJsonl) { @@ -236,10 +236,14 @@ async function main() { process.exit(0); } - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Check if Chrome session exists, then wait for page load + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const result = await extractOutlinks(url); diff --git a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py index 0684c663..af5ba256 100755 --- a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py +++ b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py @@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse import rich_click as click -EXTRACTOR_NAME = 'parse_html_urls' +PLUGIN_NAME = 'parse_html_urls' # Check if parse_dom_outlinks extractor already ran DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl') @@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0 record = { 'type': 'Snapshot', 'url': found_url, - 'via_extractor': EXTRACTOR_NAME, + 'plugin': PLUGIN_NAME, 'depth': depth + 1, } if snapshot_id: diff --git a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py index 8e8320c2..08791848 100644 --- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py +++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py @@ -233,7 +233,7 @@ class TestParseHtmlUrls: entry = json.loads(output_file.read_text().strip()) assert entry['url'] == 'https://example.com' assert 'type' in entry - assert 'via_extractor' in entry + assert 'plugin' in entry if __name__ == '__main__': diff --git a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py index b5fe8905..c92ddb0f 100755 --- a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py +++ b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py @@ -24,7 +24,7 @@ from urllib.parse import urlparse import rich_click as click -EXTRACTOR_NAME = 'parse_jsonl_urls' +PLUGIN_NAME = 'parse_jsonl_urls' def parse_bookmarked_at(link: dict) -> str | None: @@ -75,7 +75,7 @@ def 
json_object_to_entry(link: dict) -> dict | None: entry = { 'type': 'Snapshot', 'url': unescape(url), - 'via_extractor': EXTRACTOR_NAME, + 'plugin': PLUGIN_NAME, } # Parse title diff --git a/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py index e4be5a90..a169a09c 100644 --- a/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py +++ b/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py @@ -265,7 +265,7 @@ class TestParseJsonlUrls: entry = json.loads(output_file.read_text().strip()) assert entry['url'] == 'https://example.com' assert 'type' in entry - assert 'via_extractor' in entry + assert 'plugin' in entry if __name__ == '__main__': diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py index 84a8a51d..7c5fdbca 100755 --- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py +++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py @@ -22,7 +22,7 @@ from urllib.parse import urlparse import rich_click as click -EXTRACTOR_NAME = 'parse_netscape_urls' +PLUGIN_NAME = 'parse_netscape_urls' # Constants for timestamp epoch detection UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC @@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str = None): entry = { 'type': 'Snapshot', 'url': unescape(bookmark_url), - 'via_extractor': EXTRACTOR_NAME, + 'plugin': PLUGIN_NAME, } if title: entry['title'] = unescape(title) diff --git a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py b/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py index 37b41f9f..8e64c5c5 100755 --- a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py +++ b/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py @@ -23,7 +23,7 @@ from urllib.parse import urlparse import rich_click as click -EXTRACTOR_NAME = 'parse_rss_urls' +PLUGIN_NAME = 'parse_rss_urls' try: import feedparser @@ -107,7 +107,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0 entry = { 'type': 'Snapshot', 'url': unescape(item_url), - 'via_extractor': EXTRACTOR_NAME, + 'plugin': PLUGIN_NAME, 'depth': depth + 1, } if snapshot_id: diff --git a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py index 7d4f181d..ca48527b 100644 --- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py +++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py @@ -47,7 +47,7 @@ class TestRssVariants: assert entry['url'] == 'https://example.com/article1' assert entry['title'] == 'RSS 0.91 Article' - assert entry['via_extractor'] == 'parse_rss_urls' + assert entry['plugin'] == 'parse_rss_urls' def test_rss_10_rdf(self, tmp_path): """Test RSS 1.0 (RDF) format.""" diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py index 9b94d35a..958de2eb 100755 --- a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py +++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py @@ -25,7 +25,7 @@ from urllib.request import urlopen import rich_click as click -EXTRACTOR_NAME = 'parse_txt_urls' +PLUGIN_NAME = 'parse_txt_urls' # URL regex from archivebox/misc/util.py # 
https://mathiasbynens.be/demo/url-regex @@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None): f.write(json.dumps({ 'type': 'Snapshot', 'url': found_url, - 'via_extractor': EXTRACTOR_NAME, + 'plugin': PLUGIN_NAME, }) + '\n') click.echo(f'Found {len(urls_found)} URLs') diff --git a/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py index ad8b2299..64aa3fcc 100644 --- a/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py +++ b/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py @@ -186,7 +186,7 @@ https://other.com entry = json.loads(output_file.read_text().strip()) assert entry['url'] == 'https://example.com' assert 'type' in entry - assert 'via_extractor' in entry + assert 'plugin' in entry if __name__ == '__main__': diff --git a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js index db0b90ec..2d25f971 100644 --- a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js @@ -22,7 +22,7 @@ const path = require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'pdf'; +const PLUGIN_NAME = 'pdf'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'output.pdf'; const CHROME_SESSION_DIR = '../chrome'; @@ -254,10 +254,14 @@ async function main() { })); process.exit(0); // Permanent skip - staticfile already handled } else { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Only wait for page load if using shared Chrome session + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const result = await printToPdf(url); diff --git a/archivebox/plugins/extractor_utils.py b/archivebox/plugins/plugin_utils.py similarity index 97% rename from archivebox/plugins/extractor_utils.py rename to archivebox/plugins/plugin_utils.py index e62cae14..c324fa83 100644 --- a/archivebox/plugins/extractor_utils.py +++ b/archivebox/plugins/plugin_utils.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 """ -Shared utilities for extractor hooks. +Shared utilities for extractor plugin hooks. -This module provides common functionality for all extractors to ensure +This module provides common functionality for all extractor plugins to ensure consistent behavior, output format, error handling, and timing. -All extractors should: +All extractor plugins should: 1. Import and use these utilities 2. Output consistent metadata (CMD, VERSION, OUTPUT, timing) 3. Write all files to $PWD @@ -35,7 +35,7 @@ STATIC_EXTENSIONS = ( def is_static_file(url: str) -> bool: - """Check if URL points to a static file that may not need browser extraction.""" + """Check if URL points to a static file that may not need browser-based extractor plugins.""" return url.lower().split('?')[0].split('#')[0].endswith(STATIC_EXTENSIONS) @@ -96,7 +96,7 @@ def get_version(binary: str, version_args: list[str] | None = None) -> str: class ExtractorResult: """ - Tracks extractor execution and produces consistent output. + Tracks extractor plugin execution and produces consistent output. 
Usage: result = ExtractorResult(name='wget', url=url) @@ -152,7 +152,7 @@ class ExtractorResult: return 1 def finish(self, status: str | None = None): - """Mark extraction as finished and print results.""" + """Mark extractor plugin execution as finished and print results.""" self.end_ts = datetime.now(timezone.utc) if status: self.status = status diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__52_readability.py index 534751f2..b103dab3 100644 --- a/archivebox/plugins/readability/on_Snapshot__52_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py @@ -27,7 +27,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'readability' +PLUGIN_NAME = 'readability' BIN_NAME = 'readability-extractor' BIN_PROVIDERS = 'npm,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js index 99f22b2c..1ad75939 100755 --- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js @@ -14,7 +14,7 @@ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'redirects'; +const PLUGIN_NAME = 'redirects'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'redirects.jsonl'; const PID_FILE = 'hook.pid'; @@ -235,7 +235,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'succeeded', output_str: OUTPUT_FILE, - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, original_url: originalUrl, final_url: finalUrl || originalUrl, redirect_count: redirectChain.length, diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index cebc875a..9cbaf2b7 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -15,7 +15,7 @@ const path = require('path'); const crypto = require('crypto'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'responses'; +const PLUGIN_NAME = 'responses'; const OUTPUT_DIR = '.'; const PID_FILE = 'hook.pid'; const CHROME_SESSION_DIR = '../chrome'; diff --git a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js index 7b013cb2..d9b476d4 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js @@ -22,7 +22,7 @@ const path = require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'screenshot'; +const PLUGIN_NAME = 'screenshot'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'screenshot.png'; const CHROME_SESSION_DIR = '../chrome'; @@ -250,10 +250,14 @@ async function main() { })); process.exit(0); // Permanent skip - staticfile already handled } else { - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Only wait for page load if using shared Chrome session + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const 
result = await takeScreenshot(url); diff --git a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py index 42265bc6..a44d773a 100644 --- a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py +++ b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py @@ -27,7 +27,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'index_sonic' +PLUGIN_NAME = 'index_sonic' OUTPUT_DIR = '.' # Text file patterns to index @@ -83,14 +83,14 @@ def find_indexable_content() -> list[tuple[str, str]]: cwd = Path.cwd() for extractor, file_pattern in INDEXABLE_FILES: - extractor_dir = cwd / extractor - if not extractor_dir.exists(): + plugin_dir = cwd / extractor + if not plugin_dir.exists(): continue if '*' in file_pattern: - matches = list(extractor_dir.glob(file_pattern)) + matches = list(plugin_dir.glob(file_pattern)) else: - match = extractor_dir / file_pattern + match = plugin_dir / file_pattern matches = [match] if match.exists() else [] for match in matches: diff --git a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py index 907d21ab..8a8a21b6 100644 --- a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py +++ b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py @@ -25,7 +25,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'index_sqlite' +PLUGIN_NAME = 'index_sqlite' OUTPUT_DIR = '.' # Text file patterns to index, in priority order @@ -74,14 +74,14 @@ def find_indexable_content() -> list[tuple[str, str]]: cwd = Path.cwd() for extractor, file_pattern in INDEXABLE_FILES: - extractor_dir = cwd / extractor - if not extractor_dir.exists(): + plugin_dir = cwd / extractor + if not plugin_dir.exists(): continue if '*' in file_pattern: - matches = list(extractor_dir.glob(file_pattern)) + matches = list(plugin_dir.glob(file_pattern)) else: - match = extractor_dir / file_pattern + match = plugin_dir / file_pattern matches = [match] if match.exists() else [] for match in matches: diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js index 0ff7e9f6..ee437382 100755 --- a/archivebox/plugins/seo/on_Snapshot__38_seo.js +++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js @@ -20,7 +20,7 @@ const path = require('path'); const puppeteer = require('puppeteer-core'); // Extractor metadata -const EXTRACTOR_NAME = 'seo'; +const PLUGIN_NAME = 'seo'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'seo.json'; const CHROME_SESSION_DIR = '../chrome'; @@ -177,10 +177,14 @@ async function main() { process.exit(0); } - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + // Check if Chrome session exists, then wait for page load + const cdpUrl = getCdpUrl(); + if (cdpUrl) { + // Wait for page to be fully loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); + } } const result = await extractSeo(url); diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py index 785bc878..aee7ce49 100644 --- 
a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py @@ -36,7 +36,7 @@ import rich_click as click # Extractor metadata -EXTRACTOR_NAME = 'singlefile' +PLUGIN_NAME = 'singlefile' BIN_NAME = 'single-file' BIN_PROVIDERS = 'npm,env' OUTPUT_DIR = '.' diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index 20f271a8..b12e52e4 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -14,7 +14,7 @@ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'ssl'; +const PLUGIN_NAME = 'ssl'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.jsonl'; const PID_FILE = 'hook.pid'; diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js index d1201a02..01945d37 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js +++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js @@ -14,7 +14,7 @@ const fs = require('fs'); const path = require('path'); const puppeteer = require('puppeteer-core'); -const EXTRACTOR_NAME = 'staticfile'; +const PLUGIN_NAME = 'staticfile'; const OUTPUT_DIR = '.'; const PID_FILE = 'hook.pid'; const CHROME_SESSION_DIR = '../chrome'; @@ -326,7 +326,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'skipped', output_str: 'No Content-Type detected', - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, }; } else if (!isStaticFile) { // Not a static file (normal case for HTML pages) @@ -334,7 +334,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'skipped', output_str: `Not a static file (Content-Type: ${detectedContentType})`, - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, content_type: detectedContentType, }; } else if (downloadError) { @@ -343,7 +343,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'failed', output_str: downloadError, - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, content_type: detectedContentType, }; } else if (downloadedFilePath) { @@ -352,7 +352,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'succeeded', output_str: downloadedFilePath, - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, content_type: detectedContentType, }; } else { @@ -361,7 +361,7 @@ function handleShutdown(signal) { type: 'ArchiveResult', status: 'failed', output_str: 'Static file detected but download did not complete', - extractor: EXTRACTOR_NAME, + plugin: PLUGIN_NAME, content_type: detectedContentType, }; } diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__32_title.js index 714c1af0..d35e6e48 100644 --- a/archivebox/plugins/title/on_Snapshot__32_title.js +++ b/archivebox/plugins/title/on_Snapshot__32_title.js @@ -20,7 +20,7 @@ const https = require('https'); const http = require('http'); // Extractor metadata -const EXTRACTOR_NAME = 'title'; +const PLUGIN_NAME = 'title'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'title.txt'; const CHROME_SESSION_DIR = '../chrome'; diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__50_wget.py index 06771af7..0385106f 100644 --- a/archivebox/plugins/wget/on_Snapshot__50_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py @@ -39,7 +39,7 @@ import rich_click as click # Extractor 
metadata -EXTRACTOR_NAME = 'wget' +PLUGIN_NAME = 'wget' BIN_NAME = 'wget' BIN_PROVIDERS = 'apt,brew,env' OUTPUT_DIR = '.' diff --git a/bin/kill_chrome.sh b/bin/kill_chrome.sh new file mode 100755 index 00000000..3d6996ba --- /dev/null +++ b/bin/kill_chrome.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Kill zombie Chrome/Chromium processes listening on 127.0.0.1 +# Works cross-platform on macOS and Linux +# +# Usage: +# ./bin/kill_chrome.sh # Kill Chrome processes with verification +# ./bin/kill_chrome.sh --pkill # Quick kill using pkill (less precise) +# ./bin/kill_chrome.sh --help # Show this help + +set -e + +# Detect OS +OS="$(uname -s)" + +# Chrome binary patterns to search for (cross-platform) +CHROME_PATTERNS=( + "Google Chrome" + "google-chrome" + "chrome" + "chromium" + "chromium-browser" + "Chromium" +) + +# Function to kill Chrome processes +kill_chrome_processes() { + echo "Searching for Chrome processes listening on 127.0.0.1..." + local killed=0 + + for pattern in "${CHROME_PATTERNS[@]}"; do + # Find processes matching the pattern with remote debugging + if [ "$OS" = "Darwin" ]; then + # macOS + pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true) + else + # Linux + pids=$(ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | awk '{print $2}' || true) + fi + + if [ -n "$pids" ]; then + echo "Found Chrome processes ($pattern): $pids" + for pid in $pids; do + # Try regular kill first + if kill "$pid" 2>/dev/null; then + echo " Killed $pid" + killed=$((killed + 1)) + sleep 0.1 + fi + + # Check if still alive + if ps -p "$pid" > /dev/null 2>&1; then + # Check process state first to avoid attempting impossible kills + if [ "$OS" = "Darwin" ]; then + state=$(ps -o state -p "$pid" 2>/dev/null | tail -1 | tr -d ' ') + else + state=$(ps -o stat -p "$pid" 2>/dev/null | tail -1 | tr -d ' ') + fi + + # Check if it's a zombie/uninterruptible process BEFORE trying to kill + if [[ "$state" == *"Z"* ]] || [[ "$state" == *"D"* ]] || [[ "$state" == *"UNE"* ]]; then + echo " WARNING: $pid is in uninterruptible/zombie state ($state) - cannot be killed" + echo " Process will clean up automatically or requires system reboot" + else + # Try force kill + echo " Force killing $pid with -9..." + if kill -9 "$pid" 2>/dev/null; then + # Wait briefly and verify + sleep 0.2 + if ! 
ps -p "$pid" > /dev/null 2>&1; then + echo " Force killed $pid" + killed=$((killed + 1)) + else + echo " WARNING: $pid survived kill -9 (state: $state)" + fi + else + echo " ERROR: Failed to kill $pid (state: $state)" + fi + fi + fi + done + fi + done + + if [ $killed -eq 0 ]; then + echo "No Chrome processes listening on 127.0.0.1 found (or all are zombie/uninterruptible)" + else + echo "Successfully killed $killed Chrome process(es)" + fi + + # Show remaining Chrome processes (if any) + echo "" + echo "Remaining Chrome processes listening on 127.0.0.1:" + for pattern in "${CHROME_PATTERNS[@]}"; do + ps aux | grep -i "$pattern" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep || true + done | head -10 + + if [ $(ps aux | grep -iE "(google chrome|chrome|chromium)" | grep -E "(remote-debugging-port|remote-debugging-address=127\.0\.0\.1)" | grep -v grep | wc -l) -eq 0 ]; then + echo " (none)" + fi +} + +# Alternative approach using pkill (faster but less precise) +kill_chrome_pkill() { + echo "Using pkill to kill all Chrome processes..." + + for pattern in "${CHROME_PATTERNS[@]}"; do + if pkill -9 -f "$pattern" 2>/dev/null; then + echo " Killed processes matching: $pattern" + fi + done + + sleep 0.5 + echo "Done" +} + +# Show help +show_help() { + cat << EOF +Kill zombie Chrome/Chromium processes listening on 127.0.0.1 + +Usage: + $0 [OPTIONS] + +Options: + (none) Kill Chrome processes with state verification (recommended) + --pkill, -p Quick kill using pkill (faster but less precise) + --help, -h Show this help message + +Description: + This script finds and kills Chrome/Chromium processes that are listening + on 127.0.0.1 (with --remote-debugging-port or --remote-debugging-address). + + Supports multiple Chrome binary names: + - Google Chrome / chrome / google-chrome + - Chromium / chromium / chromium-browser + + Works on macOS and Linux. + + Zombie/uninterruptible processes (state UNE/Z/D) will be detected and + reported but cannot be killed. They will clean up automatically. + +Examples: + $0 # Kill with verification + $0 --pkill # Quick kill all Chrome processes + +EOF +} + +# Parse arguments +if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then + show_help +elif [ "$1" = "--pkill" ] || [ "$1" = "-p" ]; then + kill_chrome_pkill +else + kill_chrome_processes +fi diff --git a/tests/test_recursive_crawl.py b/tests/test_recursive_crawl.py index 9ed52e16..a820e519 100644 --- a/tests/test_recursive_crawl.py +++ b/tests/test_recursive_crawl.py @@ -219,7 +219,13 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process): # Kill the process proc.kill() - proc.wait() + stdout, stderr = proc.communicate() + + # Debug: print stderr to see what's happening + if stderr: + print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n") + if stdout: + print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n") conn = sqlite3.connect('index.sqlite3') c = conn.cursor()
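
Note (not part of the patch): a minimal sketch of the JSONL `Snapshot` entry format that the `parse_*` hooks emit after this rename, where the only schema change is `via_extractor` -> `plugin`. Field names are taken from the hunks above; the `emit_snapshot_entry` helper and the `parse_example_urls` plugin name are hypothetical, included only to illustrate the format.

```python
import json
from html import unescape

PLUGIN_NAME = 'parse_example_urls'  # hypothetical plugin name, mirrors the parse_* hooks above


def emit_snapshot_entry(f, url: str, title: str | None = None, depth: int | None = None) -> None:
    """Write one JSONL Snapshot entry using the renamed 'plugin' key (formerly 'via_extractor')."""
    entry = {
        'type': 'Snapshot',
        'url': unescape(url),   # the parse_* hooks above unescape HTML entities in discovered URLs
        'plugin': PLUGIN_NAME,  # renamed from 'via_extractor' in this patch
    }
    if title:
        entry['title'] = unescape(title)
    if depth is not None:
        entry['depth'] = depth  # e.g. parse_rss_urls emits depth + 1 for discovered child URLs
    f.write(json.dumps(entry) + '\n')
```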