From c7b2217cd6cdb36eda6cddcbf86a6a32faae4025 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 19 Jan 2026 01:00:53 -0800 Subject: [PATCH] tons of fixes with codex --- archivebox/cli/archivebox_add.py | 61 +- archivebox/cli/archivebox_pluginmap.py | 13 +- archivebox/config/configset.py | 7 + archivebox/core/models.py | 171 +++-- archivebox/core/settings.py | 4 + archivebox/core/views.py | 10 +- archivebox/crawls/models.py | 64 +- archivebox/hooks.py | 156 ++--- archivebox/machine/detect.py | 66 +- archivebox/machine/models.py | 190 ++++-- .../machine/tests/test_machine_models.py | 36 +- archivebox/misc/checks.py | 4 +- archivebox/misc/progress_layout.py | 615 +++++++++++------- .../plugins/accessibility/templates/icon.html | 1 + .../plugins/accessibility/tests/__init__.py | 1 - .../plugins/apt/on_Binary__13_apt_install.py | 2 +- archivebox/plugins/apt/tests/__init__.py | 1 - .../plugins/apt/tests/test_apt_provider.py | 4 +- .../on_Snapshot__13_archivedotorg.py | 14 + .../plugins/archivedotorg/templates/icon.html | 2 +- archivebox/plugins/chrome/binaries.jsonl | 1 - archivebox/plugins/chrome/chrome_utils.js | 21 +- archivebox/plugins/chrome/config.json | 4 +- .../chrome/on_Crawl__01_chrome_install.py | 265 -------- .../chrome/on_Crawl__70_chrome_install.py | 34 + ...bg.js => on_Crawl__90_chrome_launch.bg.js} | 6 +- ...bg.js => on_Snapshot__10_chrome_tab.bg.js} | 83 ++- .../chrome/on_Snapshot__11_chrome_wait.js | 76 +++ .../chrome/on_Snapshot__30_chrome_navigate.js | 2 +- archivebox/plugins/chrome/templates/icon.html | 1 + archivebox/plugins/chrome/tests/__init__.py | 0 .../chrome/tests/chrome_test_helpers.py | 178 +++-- .../plugins/chrome/tests/test_chrome.py | 57 +- .../on_Snapshot__21_consolelog.bg.js | 61 +- .../plugins/consolelog/templates/icon.html | 1 + .../plugins/consolelog/tests/__init__.py | 1 - .../consolelog/tests/test_consolelog.py | 26 +- .../custom/on_Binary__14_custom_install.py | 13 +- archivebox/plugins/custom/tests/__init__.py | 1 - 
.../custom/tests/test_custom_provider.py | 4 +- .../plugins/dns/on_Snapshot__22_dns.bg.js | 63 +- archivebox/plugins/dns/templates/icon.html | 1 + archivebox/plugins/dom/on_Snapshot__53_dom.js | 16 +- archivebox/plugins/dom/templates/icon.html | 2 +- archivebox/plugins/dom/tests/test_dom.py | 2 +- .../plugins/env/on_Binary__15_env_install.py | 3 +- archivebox/plugins/env/tests/__init__.py | 1 - .../plugins/env/tests/test_env_provider.py | 4 +- .../favicon/on_Snapshot__11_favicon.py | 9 +- .../plugins/favicon/templates/icon.html | 2 +- archivebox/plugins/forumdl/binaries.jsonl | 1 - .../forumdl/on_Crawl__13_forumdl_install.py | 80 --- .../forumdl/on_Crawl__25_forumdl_install.py | 79 +++ ...dl.bg.py => on_Snapshot__04_forumdl.bg.py} | 37 +- .../plugins/forumdl/templates/icon.html | 2 +- archivebox/plugins/gallerydl/binaries.jsonl | 1 - .../on_Crawl__10_gallerydl_install.py | 80 --- .../on_Crawl__20_gallerydl_install.py | 48 ++ ....bg.py => on_Snapshot__03_gallerydl.bg.py} | 54 +- .../plugins/gallerydl/templates/icon.html | 2 +- archivebox/plugins/git/binaries.jsonl | 1 - .../plugins/git/on_Crawl__05_git_install.py | 48 ++ .../plugins/git/on_Crawl__09_git_install.py | 80 --- ...t__62_git.py => on_Snapshot__05_git.bg.py} | 2 +- archivebox/plugins/git/templates/icon.html | 2 +- .../plugins/headers/templates/icon.html | 2 +- .../htmltotext/on_Snapshot__58_htmltotext.py | 24 +- .../plugins/htmltotext/templates/icon.html | 2 +- .../plugins/infiniscroll/templates/icon.html | 1 + ...l_istilldontcareaboutcookies_extension.js} | 2 +- archivebox/plugins/mercury/binaries.jsonl | 1 - .../mercury/on_Crawl__12_mercury_install.py | 85 --- .../mercury/on_Crawl__40_mercury_install.py | 53 ++ .../plugins/mercury/templates/icon.html | 2 +- .../plugins/merkletree/templates/icon.html | 1 + .../plugins/merkletree/tests/__init__.py | 1 - .../on_Snapshot__15_modalcloser.bg.js | 2 +- .../plugins/modalcloser/templates/icon.html | 1 + .../plugins/npm/on_Binary__10_npm_install.py | 38 +- 
.../plugins/npm/on_Crawl__00_npm_install.py | 51 ++ archivebox/plugins/npm/tests/__init__.py | 1 - .../plugins/npm/tests/test_npm_provider.py | 4 +- archivebox/plugins/papersdl/binaries.jsonl | 1 - .../papersdl/on_Crawl__14_papersdl_install.py | 80 --- .../papersdl/on_Crawl__30_papersdl_install.py | 48 ++ .../papersdl/on_Snapshot__66_papersdl.bg.py | 37 +- .../plugins/papersdl/templates/icon.html | 2 +- .../on_Snapshot__75_parse_dom_outlinks.js | 5 + .../parse_dom_outlinks/templates/icon.html | 2 +- .../parse_dom_outlinks/tests/__init__.py | 1 - .../tests/test_parse_dom_outlinks.py | 3 +- .../on_Snapshot__70_parse_html_urls.py | 143 +++- .../parse_html_urls/templates/icon.html | 2 +- .../on_Snapshot__74_parse_jsonl_urls.py | 7 + .../parse_jsonl_urls/templates/icon.html | 2 +- .../on_Snapshot__73_parse_netscape_urls.py | 7 + .../parse_netscape_urls/templates/icon.html | 2 +- .../on_Snapshot__72_parse_rss_urls.py | 7 + .../parse_rss_urls/templates/icon.html | 2 +- .../on_Snapshot__71_parse_txt_urls.py | 7 + .../parse_txt_urls/templates/icon.html | 2 +- archivebox/plugins/pdf/on_Snapshot__52_pdf.js | 16 +- archivebox/plugins/pdf/templates/icon.html | 2 +- .../plugins/pip/on_Binary__11_pip_install.py | 43 +- archivebox/plugins/pip/tests/__init__.py | 1 - .../plugins/pip/tests/test_pip_provider.py | 30 +- archivebox/plugins/puppeteer/__init__.py | 1 + .../on_Binary__12_puppeteer_install.py | 170 +++++ .../on_Crawl__60_puppeteer_install.py | 31 + .../plugins/puppeteer/tests/test_puppeteer.py | 124 ++++ archivebox/plugins/readability/binaries.jsonl | 1 - .../on_Crawl__11_readability_install.py | 83 --- .../on_Crawl__35_readability_install.py | 53 ++ .../on_Snapshot__56_readability.py | 18 +- .../plugins/readability/templates/icon.html | 2 +- ....bg.js => on_Snapshot__25_redirects.bg.js} | 14 +- .../plugins/redirects/templates/icon.html | 1 + .../plugins/redirects/tests/__init__.py | 1 - .../plugins/redirects/tests/test_redirects.py | 38 +- 
.../responses/on_Snapshot__24_responses.bg.js | 60 +- .../plugins/responses/templates/icon.html | 1 + .../plugins/responses/tests/__init__.py | 1 - .../plugins/responses/tests/test_responses.py | 31 +- .../screenshot/on_Snapshot__51_screenshot.js | 16 +- .../plugins/screenshot/templates/icon.html | 2 +- .../screenshot/tests/test_screenshot.py | 2 +- .../search_backend_ripgrep/binaries.jsonl | 1 - .../on_Crawl__00_ripgrep_install.py | 92 --- .../on_Crawl__50_ripgrep_install.py | 32 + .../plugins/search_backend_ripgrep/search.py | 14 +- .../search_backend_ripgrep/tests/__init__.py | 0 .../tests/test_ripgrep_detection.py | 34 +- .../tests/test_ripgrep_search.py | 4 +- .../search_backend_sonic/templates/icon.html | 1 + .../plugins/search_backend_sqlite/search.py | 11 +- .../search_backend_sqlite/templates/icon.html | 1 + .../search_backend_sqlite/tests/__init__.py | 1 - archivebox/plugins/seo/templates/icon.html | 1 + archivebox/plugins/seo/tests/__init__.py | 1 - archivebox/plugins/seo/tests/test_seo.py | 3 +- archivebox/plugins/singlefile/binaries.jsonl | 1 - archivebox/plugins/singlefile/config.json | 2 +- .../on_Crawl__08_singlefile_install.py | 85 --- .../on_Crawl__45_singlefile_install.py | 54 ++ ....js => on_Crawl__82_singlefile_install.js} | 2 +- .../singlefile/on_Snapshot__50_singlefile.py | 83 ++- .../plugins/singlefile/templates/icon.html | 2 +- .../singlefile/tests/test_singlefile.py | 7 +- .../plugins/ssl/on_Snapshot__23_ssl.bg.js | 57 +- archivebox/plugins/ssl/templates/icon.html | 1 + archivebox/plugins/ssl/tests/__init__.py | 1 - archivebox/plugins/ssl/tests/test_ssl.py | 32 +- ...bg.js => on_Snapshot__26_staticfile.bg.js} | 14 +- .../plugins/staticfile/templates/icon.html | 2 +- .../plugins/staticfile/tests/__init__.py | 1 - .../staticfile/tests/test_staticfile.py | 25 +- archivebox/plugins/title/templates/icon.html | 2 +- ....js => on_Crawl__83_twocaptcha_install.js} | 4 +- ...g.js => on_Crawl__95_twocaptcha_config.js} | 4 +- 
.../twocaptcha/tests/test_twocaptcha.py | 4 +- ... on_Crawl__80_install_ublock_extension.js} | 2 +- archivebox/plugins/wget/binaries.jsonl | 1 - .../plugins/wget/on_Crawl__06_wget_install.py | 146 ----- .../plugins/wget/on_Crawl__10_wget_install.py | 95 +++ ..._61_wget.py => on_Snapshot__06_wget.bg.py} | 19 +- archivebox/plugins/wget/templates/icon.html | 2 +- archivebox/plugins/wget/tests/test_wget.py | 2 +- archivebox/plugins/ytdlp/binaries.jsonl | 3 - .../ytdlp/on_Crawl__07_ytdlp_install.py | 80 --- .../ytdlp/on_Crawl__15_ytdlp_install.py | 64 ++ ...tdlp.bg.py => on_Snapshot__02_ytdlp.bg.py} | 58 +- archivebox/plugins/ytdlp/templates/icon.html | 2 +- archivebox/templates/core/snapshot_live.html | 6 +- archivebox/templates/static/admin.css | 32 + archivebox/tests/conftest.py | 14 +- archivebox/tests/test_cli_add_interrupt.py | 133 ++++ archivebox/tests/test_hooks.py | 147 ++--- archivebox/tests/test_list.py | 12 +- archivebox/tests/test_real_world_add.py | 133 ++++ .../tests/test_settings_signal_webhooks.py | 8 + archivebox/tests/test_snapshot.py | 105 +-- archivebox/workers/orchestrator.py | 276 ++++++-- archivebox/workers/tests/test_orchestrator.py | 40 ++ archivebox/workers/worker.py | 154 +++-- 184 files changed, 3943 insertions(+), 2420 deletions(-) delete mode 100644 archivebox/plugins/accessibility/tests/__init__.py delete mode 100644 archivebox/plugins/apt/tests/__init__.py delete mode 100644 archivebox/plugins/chrome/binaries.jsonl delete mode 100755 archivebox/plugins/chrome/on_Crawl__01_chrome_install.py create mode 100755 archivebox/plugins/chrome/on_Crawl__70_chrome_install.py rename archivebox/plugins/chrome/{on_Crawl__20_chrome_launch.bg.js => on_Crawl__90_chrome_launch.bg.js} (98%) rename archivebox/plugins/chrome/{on_Snapshot__20_chrome_tab.bg.js => on_Snapshot__10_chrome_tab.bg.js} (86%) create mode 100644 archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js delete mode 100644 archivebox/plugins/chrome/tests/__init__.py delete mode 100644 
archivebox/plugins/consolelog/tests/__init__.py delete mode 100644 archivebox/plugins/custom/tests/__init__.py delete mode 100644 archivebox/plugins/env/tests/__init__.py delete mode 100644 archivebox/plugins/forumdl/binaries.jsonl delete mode 100755 archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py create mode 100755 archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py rename archivebox/plugins/forumdl/{on_Snapshot__65_forumdl.bg.py => on_Snapshot__04_forumdl.bg.py} (87%) delete mode 100644 archivebox/plugins/gallerydl/binaries.jsonl delete mode 100755 archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py create mode 100755 archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py rename archivebox/plugins/gallerydl/{on_Snapshot__64_gallerydl.bg.py => on_Snapshot__03_gallerydl.bg.py} (81%) delete mode 100644 archivebox/plugins/git/binaries.jsonl create mode 100755 archivebox/plugins/git/on_Crawl__05_git_install.py delete mode 100755 archivebox/plugins/git/on_Crawl__09_git_install.py rename archivebox/plugins/git/{on_Snapshot__62_git.py => on_Snapshot__05_git.bg.py} (98%) create mode 100644 archivebox/plugins/infiniscroll/templates/icon.html rename archivebox/plugins/istilldontcareaboutcookies/{on_Crawl__02_istilldontcareaboutcookies_install.js => on_Crawl__81_install_istilldontcareaboutcookies_extension.js} (97%) delete mode 100644 archivebox/plugins/mercury/binaries.jsonl delete mode 100755 archivebox/plugins/mercury/on_Crawl__12_mercury_install.py create mode 100755 archivebox/plugins/mercury/on_Crawl__40_mercury_install.py delete mode 100644 archivebox/plugins/merkletree/tests/__init__.py create mode 100644 archivebox/plugins/modalcloser/templates/icon.html create mode 100644 archivebox/plugins/npm/on_Crawl__00_npm_install.py delete mode 100644 archivebox/plugins/npm/tests/__init__.py delete mode 100644 archivebox/plugins/papersdl/binaries.jsonl delete mode 100755 archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py create 
mode 100755 archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py delete mode 100644 archivebox/plugins/parse_dom_outlinks/tests/__init__.py delete mode 100644 archivebox/plugins/pip/tests/__init__.py create mode 100644 archivebox/plugins/puppeteer/__init__.py create mode 100644 archivebox/plugins/puppeteer/on_Binary__12_puppeteer_install.py create mode 100644 archivebox/plugins/puppeteer/on_Crawl__60_puppeteer_install.py create mode 100644 archivebox/plugins/puppeteer/tests/test_puppeteer.py delete mode 100644 archivebox/plugins/readability/binaries.jsonl delete mode 100755 archivebox/plugins/readability/on_Crawl__11_readability_install.py create mode 100755 archivebox/plugins/readability/on_Crawl__35_readability_install.py rename archivebox/plugins/redirects/{on_Snapshot__31_redirects.bg.js => on_Snapshot__25_redirects.bg.js} (93%) delete mode 100644 archivebox/plugins/redirects/tests/__init__.py delete mode 100644 archivebox/plugins/responses/tests/__init__.py delete mode 100644 archivebox/plugins/search_backend_ripgrep/binaries.jsonl delete mode 100755 archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py create mode 100755 archivebox/plugins/search_backend_ripgrep/on_Crawl__50_ripgrep_install.py delete mode 100644 archivebox/plugins/search_backend_ripgrep/tests/__init__.py delete mode 100644 archivebox/plugins/search_backend_sqlite/tests/__init__.py delete mode 100644 archivebox/plugins/seo/tests/__init__.py delete mode 100644 archivebox/plugins/singlefile/binaries.jsonl delete mode 100755 archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py create mode 100755 archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py rename archivebox/plugins/singlefile/{on_Crawl__04_singlefile_install.js => on_Crawl__82_singlefile_install.js} (99%) delete mode 100644 archivebox/plugins/ssl/tests/__init__.py rename archivebox/plugins/staticfile/{on_Snapshot__32_staticfile.bg.js => on_Snapshot__26_staticfile.bg.js} (95%) delete mode 
100644 archivebox/plugins/staticfile/tests/__init__.py rename archivebox/plugins/twocaptcha/{on_Crawl__05_twocaptcha_install.js => on_Crawl__83_twocaptcha_install.js} (93%) rename archivebox/plugins/twocaptcha/{on_Crawl__25_twocaptcha_config.js => on_Crawl__95_twocaptcha_config.js} (99%) rename archivebox/plugins/ublock/{on_Crawl__03_ublock_install.js => on_Crawl__80_install_ublock_extension.js} (95%) delete mode 100644 archivebox/plugins/wget/binaries.jsonl delete mode 100755 archivebox/plugins/wget/on_Crawl__06_wget_install.py create mode 100755 archivebox/plugins/wget/on_Crawl__10_wget_install.py rename archivebox/plugins/wget/{on_Snapshot__61_wget.py => on_Snapshot__06_wget.bg.py} (92%) delete mode 100644 archivebox/plugins/ytdlp/binaries.jsonl delete mode 100755 archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py create mode 100755 archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py rename archivebox/plugins/ytdlp/{on_Snapshot__63_ytdlp.bg.py => on_Snapshot__02_ytdlp.bg.py} (81%) create mode 100644 archivebox/tests/test_cli_add_interrupt.py create mode 100644 archivebox/tests/test_real_world_add.py create mode 100644 archivebox/tests/test_settings_signal_webhooks.py diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 65a34c02..d21c11c6 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -4,6 +4,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox add' import sys +from pathlib import Path from typing import TYPE_CHECKING @@ -14,7 +15,7 @@ from django.db.models import QuerySet from archivebox.misc.util import enforce_types, docstring from archivebox import CONSTANTS -from archivebox.config.common import ARCHIVING_CONFIG +from archivebox.config.common import ARCHIVING_CONFIG, SERVER_CONFIG from archivebox.config.permissions import USER, HOSTNAME @@ -57,8 +58,11 @@ def add(urls: str | list[str], from archivebox.crawls.models import Crawl from archivebox.base_models.models import 
get_or_create_system_user_pk from archivebox.workers.orchestrator import Orchestrator + from archivebox.misc.logging_util import printable_filesize + from archivebox.misc.system import get_dir_size created_by_id = created_by_id or get_or_create_system_user_pk() + started_at = timezone.now() # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt' @@ -127,11 +131,56 @@ def add(urls: str | list[str], # Background mode: just queue work and return (orchestrator via server will pick it up) print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]') else: - # Foreground mode: run CrawlWorker inline until all work is done - print(f'[green]\\[*] Starting worker to process crawl...[/green]') - from archivebox.workers.worker import CrawlWorker - worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0) - worker.runloop() # Block until complete + # Foreground mode: run full orchestrator until all work is done + print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]') + from archivebox.workers.orchestrator import Orchestrator + orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id)) + orchestrator.runloop() # Block until complete + + # Print summary for foreground runs + try: + crawl.refresh_from_db() + snapshots_count = crawl.snapshot_set.count() + try: + total_bytes = sum(s.archive_size for s in crawl.snapshot_set.all()) + except Exception: + total_bytes, _, _ = get_dir_size(crawl.output_dir) + total_size = printable_filesize(total_bytes) + total_time = timezone.now() - started_at + total_seconds = int(total_time.total_seconds()) + mins, secs = divmod(total_seconds, 60) + hours, mins = divmod(mins, 60) + if hours: + duration_str = f"{hours}h {mins}m {secs}s" + elif mins: + duration_str = f"{mins}m {secs}s" + else: + duration_str = f"{secs}s" + + # Output dir 
relative to DATA_DIR + try: + rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR) + rel_output_str = f'./{rel_output}' + except Exception: + rel_output_str = str(crawl.output_dir) + + # Build admin URL from SERVER_CONFIG + bind_addr = SERVER_CONFIG.BIND_ADDR + if bind_addr.startswith('http://') or bind_addr.startswith('https://'): + base_url = bind_addr + else: + base_url = f'http://{bind_addr}' + admin_url = f'{base_url}/admin/crawls/crawl/{crawl.id}/change/' + + print('\n[bold]crawl output saved to:[/bold]') + print(f' {rel_output_str}') + print(f' {admin_url}') + print(f'\n[bold]total urls snapshotted:[/bold] {snapshots_count}') + print(f'[bold]total size:[/bold] {total_size}') + print(f'[bold]total time:[/bold] {duration_str}') + except Exception: + # Summary is best-effort; avoid failing the command if something goes wrong + pass # 6. Return the list of Snapshots in this crawl return crawl.snapshot_set.all() diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index 04a8cba6..fe280faa 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -205,7 +205,6 @@ def pluginmap( from archivebox.hooks import ( discover_hooks, - extract_step, is_background_hook, BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, @@ -277,16 +276,14 @@ def pluginmap( # Build hook info list hook_infos = [] for hook_path in hooks: - # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py') + # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__06_wget.bg.py') plugin_name = hook_path.parent.name - step = extract_step(hook_path.name) is_bg = is_background_hook(hook_path.name) hook_infos.append({ 'path': str(hook_path), 'name': hook_path.name, 'plugin': plugin_name, - 'step': step, 'is_background': is_bg, 'extension': hook_path.suffix, }) @@ -316,20 +313,18 @@ def pluginmap( show_header=True, header_style='bold magenta', ) - 
table.add_column('Step', justify='center', width=6) table.add_column('Plugin', style='cyan', width=20) table.add_column('Hook Name', style='green') table.add_column('BG', justify='center', width=4) table.add_column('Type', justify='center', width=5) - # Sort by step then by name - sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name'])) + # Sort lexicographically by hook name + sorted_hooks = sorted(hook_infos, key=lambda h: h['name']) for hook in sorted_hooks: bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else '' ext = hook['extension'].lstrip('.') table.add_row( - str(hook['step']), hook['plugin'], hook['name'], bg_marker, @@ -347,7 +342,7 @@ def pluginmap( prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]') prnt() prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]') - prnt('[dim] - XX: Two-digit order (first digit = step 0-9)[/dim]') + prnt('[dim] - XX: Two-digit lexicographic order (00-99)[/dim]') prnt('[dim] - .bg: Background hook (non-blocking)[/dim]') prnt('[dim] - ext: py, sh, or js[/dim]') prnt() diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index d4a02141..19e2e2d2 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -258,11 +258,18 @@ def get_config( # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session if crawl and hasattr(crawl, "output_dir"): config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir) + config['CRAWL_ID'] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get('CRAWL_ID') # Apply snapshot config overrides (highest priority) if snapshot and hasattr(snapshot, "config") and snapshot.config: config.update(snapshot.config) + if snapshot: + config['SNAPSHOT_ID'] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get('SNAPSHOT_ID') + config['SNAPSHOT_DEPTH'] = int(getattr(snapshot, "depth", 0) or 0) + if getattr(snapshot, "crawl_id", None): + 
config['CRAWL_ID'] = str(snapshot.crawl_id) + # Normalize all aliases to canonical names (after all sources merged) # This handles aliases that came from user/crawl/snapshot configs, not just env try: diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f86ef048..bd943a29 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -344,6 +344,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea @property def process_set(self): """Get all Process objects related to this snapshot's ArchiveResults.""" + import json + import json from archivebox.machine.models import Process return Process.objects.filter(archiveresult__snapshot_id=self.id) @@ -613,7 +615,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ONLY used by: archivebox update (for orphan detection) """ - import json + from archivebox.machine.models import Process # Try index.jsonl first (new format), then index.json (legacy) jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME @@ -622,15 +624,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea data = None if jsonl_path.exists(): try: - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if line.startswith('{'): - record = json.loads(line) - if record.get('type') == 'Snapshot': - data = record - break - except (json.JSONDecodeError, OSError): + records = Process.parse_records_from_text(jsonl_path.read_text()) + for record in records: + if record.get('type') == 'Snapshot': + data = record + break + except OSError: pass elif json_path.exists(): try: @@ -689,7 +688,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ONLY used by: archivebox update (for orphan import) """ - import json + from archivebox.machine.models import Process # Try index.jsonl first (new format), then index.json (legacy) jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME @@ -698,15 +697,12 @@ 
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea data = None if jsonl_path.exists(): try: - with open(jsonl_path) as f: - for line in f: - line = line.strip() - if line.startswith('{'): - record = json.loads(line) - if record.get('type') == 'Snapshot': - data = record - break - except (json.JSONDecodeError, OSError): + records = Process.parse_records_from_text(jsonl_path.read_text()) + for record in records: + if record.get('type') == 'Snapshot': + data = record + break + except OSError: pass elif json_path.exists(): try: @@ -1040,7 +1036,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Returns dict with keys: 'snapshot', 'archive_results', 'binaries', 'processes' """ - import json + from archivebox.machine.models import Process from archivebox.misc.jsonl import ( TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY, TYPE_PROCESS, ) @@ -1056,24 +1052,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not index_path.exists(): return result - with open(index_path, 'r') as f: - for line in f: - line = line.strip() - if not line or not line.startswith('{'): - continue - try: - record = json.loads(line) - record_type = record.get('type') - if record_type == TYPE_SNAPSHOT: - result['snapshot'] = record - elif record_type == TYPE_ARCHIVERESULT: - result['archive_results'].append(record) - elif record_type == TYPE_BINARY: - result['binaries'].append(record) - elif record_type == TYPE_PROCESS: - result['processes'].append(record) - except json.JSONDecodeError: - continue + records = Process.parse_records_from_text(index_path.read_text()) + for record in records: + record_type = record.get('type') + if record_type == TYPE_SNAPSHOT: + result['snapshot'] = record + elif record_type == TYPE_ARCHIVERESULT: + result['archive_results'].append(record) + elif record_type == TYPE_BINARY: + result['binaries'].append(record) + elif record_type == TYPE_PROCESS: + 
result['processes'].append(record) return result @@ -1317,7 +1306,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea for plugin in all_plugins: result = archive_results.get(plugin) existing = result and result.status == 'succeeded' and (result.output_files or result.output_str) - icon = get_plugin_icon(plugin) + icon = mark_safe(get_plugin_icon(plugin)) # Skip plugins with empty icons that have no output # (e.g., staticfile only shows when there's actual output) @@ -1373,6 +1362,45 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return str(current_path) + def ensure_crawl_symlink(self) -> None: + """Ensure snapshot is symlinked under its crawl output directory.""" + import os + from pathlib import Path + from django.utils import timezone + from archivebox import DATA_DIR + from archivebox.crawls.models import Crawl + + if not self.crawl_id: + return + crawl = Crawl.objects.filter(id=self.crawl_id).select_related('created_by').first() + if not crawl: + return + + date_base = crawl.created_at or self.created_at or timezone.now() + date_str = date_base.strftime('%Y%m%d') + domain = self.extract_domain_from_url(self.url) + username = crawl.created_by.username if crawl.created_by_id else 'system' + + crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id) + link_path = crawl_dir / 'snapshots' / domain / str(self.id) + link_parent = link_path.parent + link_parent.mkdir(parents=True, exist_ok=True) + + target = Path(self.output_dir) + if link_path.exists() or link_path.is_symlink(): + if link_path.is_symlink(): + if link_path.resolve() == target.resolve(): + return + link_path.unlink(missing_ok=True) + else: + return + + rel_target = os.path.relpath(target, link_parent) + try: + link_path.symlink_to(rel_target, target_is_directory=True) + except OSError: + return + @cached_property def archive_path(self): return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' @@ 
-1636,6 +1664,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if update_fields: snapshot.save(update_fields=update_fields + ['modified_at']) + snapshot.ensure_crawl_symlink() + return snapshot def create_pending_archiveresults(self) -> list['ArchiveResult']: @@ -1689,7 +1719,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ # Check if any ARs are still pending/started pending = self.archiveresult_set.exclude( - status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES + status__in=ArchiveResult.FINAL_STATES ).exists() return not pending @@ -1754,7 +1784,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea - Plugins run in order (numeric prefix) - Each plugin checks its dependencies at runtime - Dependency handling (e.g., chrome_session → screenshot): + Dependency handling (e.g., chrome → screenshot): - Plugins check if required outputs exist before running - If dependency output missing → plugin returns 'skipped' - On retry, if dependency now succeeds → dependent can run @@ -2117,6 +2147,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea TITLE_LOADING_MSG = 'Not yet archived...' 
canonical = self.canonical_outputs() + preview_priority = [ + 'singlefile_path', + 'screenshot_path', + 'wget_path', + 'dom_path', + 'pdf_path', + 'readability_path', + ] + best_preview_path = next( + (canonical.get(key) for key in preview_priority if canonical.get(key)), + canonical.get('index_path', 'index.html'), + ) context = { **self.to_dict(extended=True), **{f'{k}_path': v for k, v in canonical.items()}, @@ -2132,6 +2174,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'oldest_archive_date': ts_to_date_str(self.oldest_archive_date), 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, + 'best_preview_path': best_preview_path, } rendered_html = render_to_string('snapshot.html', context) atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) @@ -2669,12 +2712,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi - end_ts, retry_at, cmd, cmd_version, binary FK - Processes side-effect records (Snapshot, Tag, etc.) 
via process_hook_records() """ - import json import mimetypes from collections import defaultdict from pathlib import Path from django.utils import timezone - from archivebox.hooks import process_hook_records + from archivebox.hooks import process_hook_records, extract_records_from_process + from archivebox.machine.models import Process plugin_dir = Path(self.pwd) if self.pwd else None if not plugin_dir or not plugin_dir.exists(): @@ -2687,15 +2730,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # Read and parse JSONL output from stdout.log stdout_file = plugin_dir / 'stdout.log' - stdout = stdout_file.read_text() if stdout_file.exists() else '' - records = [] - for line in stdout.splitlines(): - if line.strip() and line.strip().startswith('{'): - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - continue + if self.process_id and self.process: + records = extract_records_from_process(self.process) + + if not records: + stdout = stdout_file.read_text() if stdout_file.exists() else '' + records = Process.parse_records_from_text(stdout) # Find ArchiveResult record and update status/output from it ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] @@ -2722,9 +2763,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self._set_binary_from_cmd(hook_data['cmd']) # Note: cmd_version is derived from binary.version, not stored on Process else: - # No ArchiveResult record = failed - self.status = self.StatusChoices.FAILED - self.output_str = 'Hook did not output ArchiveResult record' + # No ArchiveResult record: treat background hooks or clean exits as skipped + is_background = False + try: + from archivebox.hooks import is_background_hook + is_background = bool(self.hook_name and is_background_hook(self.hook_name)) + except Exception: + pass + + if is_background or (self.process_id and self.process and self.process.exit_code == 0): + self.status = 
self.StatusChoices.SKIPPED + self.output_str = 'Hook did not output ArchiveResult record' + else: + self.status = self.StatusChoices.FAILED + self.output_str = 'Hook did not output ArchiveResult record' # Walk filesystem and populate output_files, output_size, output_mimetypes exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} @@ -2793,14 +2845,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi } process_hook_records(filtered_records, overrides=overrides) - # Cleanup PID files and empty logs + # Cleanup PID files (keep logs even if empty so they can be tailed) pid_file = plugin_dir / 'hook.pid' pid_file.unlink(missing_ok=True) - stderr_file = plugin_dir / 'stderr.log' - if stdout_file.exists() and stdout_file.stat().st_size == 0: - stdout_file.unlink() - if stderr_file.exists() and stderr_file.stat().st_size == 0: - stderr_file.unlink() def _set_binary_from_cmd(self, cmd: list) -> None: """ @@ -3186,4 +3233,4 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): # Manually register state machines with python-statemachine registry # (normally auto-discovered from statemachines.py, but we define them here for clarity) registry.register(SnapshotMachine) -registry.register(ArchiveResultMachine) \ No newline at end of file +registry.register(ArchiveResultMachine) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index aee8d19d..16b6df0c 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -436,6 +436,10 @@ SIGNAL_WEBHOOKS = { }, } +# Avoid background threads touching sqlite connections (especially during tests/migrations). 
+if DATABASES["default"]["ENGINE"].endswith("sqlite3"): + SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler" + ################################################################################ ### Admin Data View Settings ################################################################################ diff --git a/archivebox/core/views.py b/archivebox/core/views.py index f0410846..eec08661 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -120,7 +120,15 @@ class SnapshotView(View): # Get available extractor plugins from hooks (sorted by numeric prefix for ordering) # Convert to base names for display ordering all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()] - preferred_types = tuple(all_plugins) + preview_priority = [ + 'singlefile', + 'screenshot', + 'wget', + 'dom', + 'pdf', + 'readability', + ] + preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority]) all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) best_result = {'path': 'None', 'result': None} diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 9083d9f5..969287cc 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -313,6 +313,12 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if tags: snapshot.save_tags(tags.split(',')) + # Ensure crawl -> snapshot symlink exists for both new and existing snapshots + try: + snapshot.ensure_crawl_symlink() + except Exception: + pass + return created_snapshots def run(self) -> 'Snapshot | None': @@ -325,7 +331,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith The root Snapshot for this crawl, or None for system crawls that don't create snapshots """ import time - import json from pathlib import Path from archivebox.hooks import run_hook, discover_hooks, process_hook_records 
from archivebox.config.configset import get_config @@ -339,35 +344,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Get merged config with crawl context config = get_config(crawl=self) - # Load all binaries.jsonl files from plugins - # This replaces individual on_Crawl install hooks with declarative configuration - from archivebox.hooks import BUILTIN_PLUGINS_DIR - from archivebox.machine.models import Machine - - machine_id = str(Machine.current().id) - binaries_records = [] - - for binaries_file in BUILTIN_PLUGINS_DIR.glob('*/binaries.jsonl'): - try: - with open(binaries_file, 'r') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - try: - record = json.loads(line) - if record.get('type') == 'Binary': - record['machine_id'] = machine_id - binaries_records.append(record) - except json.JSONDecodeError: - pass - except Exception: - pass - - # Process binary declarations before running hooks - if binaries_records: - overrides = {'crawl': self} - process_hook_records(binaries_records, overrides=overrides) - # Discover and run on_Crawl hooks with open(debug_log, 'a') as f: f.write(f'Discovering Crawl hooks...\n') @@ -418,6 +394,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if stats: print(f'[green]✓ Created: {stats}[/green]') + # Ensure any newly declared binaries are installed before creating snapshots + from archivebox.machine.models import Binary, Machine + from django.utils import timezone + + machine = Machine.current() + while True: + pending_binaries = Binary.objects.filter( + machine=machine, + status=Binary.StatusChoices.QUEUED, + retry_at__lte=timezone.now(), + ).order_by('retry_at') + if not pending_binaries.exists(): + break + + for binary in pending_binaries: + try: + binary.sm.tick() + except Exception: + continue + + # Exit if nothing else is immediately retryable + if not Binary.objects.filter( + machine=machine, + 
status=Binary.StatusChoices.QUEUED, + retry_at__lte=timezone.now(), + ).exists(): + break + # Create snapshots from all URLs in self.urls with open(debug_log, 'a') as f: f.write(f'Creating snapshots from URLs...\n') diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 04bfa0ef..e5483e59 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -15,29 +15,29 @@ Hook contract: Exit: 0 = success, non-zero = failure Execution order: - - Hooks are numbered 00-99 with first digit determining step (0-9) - - All hooks in a step can run in parallel - - Steps execute sequentially (step 0 → step 1 → ... → step 9) - - Background hooks (.bg suffix) don't block step advancement + - Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename + - Foreground hooks run sequentially in that order + - Background hooks (.bg suffix) run concurrently and do not block foreground progress + - After all foreground hooks complete, background hooks receive SIGTERM and must finalize - Failed extractors don't block subsequent extractors Hook Naming Convention: on_{ModelName}__{run_order}_{description}[.bg].{ext} Examples: - on_Snapshot__00_setup.py # Step 0, runs first - on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block) - on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step) - on_Snapshot__63_media.bg.py # Step 6, background (long-running) + on_Snapshot__00_setup.py # runs first + on_Snapshot__10_chrome_tab.bg.js # background (doesn't block) + on_Snapshot__50_screenshot.js # foreground (blocks) + on_Snapshot__63_media.bg.py # background (long-running) Dependency handling: Extractor plugins that depend on other plugins' output should check at runtime: ```python # Example: screenshot plugin depends on chrome plugin - chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session' - if not (chrome_session_dir / 'session.json').exists(): - print('{"status": "skipped", "output": "chrome_session not 
available"}') + chrome_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome' + if not (chrome_dir / 'cdp_url.txt').exists(): + print('{"status": "skipped", "output": "chrome session not available"}') sys.exit(1) # Exit non-zero so it gets retried later ``` @@ -50,7 +50,7 @@ API (all hook logic lives here): discover_hooks(event) -> List[Path] Find hook scripts run_hook(script, ...) -> HookResult Execute a hook script run_hooks(event, ...) -> List[HookResult] Run all hooks for an event - extract_step(hook_name) -> int Get step number (0-9) from hook name + extract_step(hook_name) -> int Deprecated: get two-digit order prefix if present is_background_hook(name) -> bool Check if hook is background (.bg suffix) """ @@ -67,6 +67,7 @@ from typing import List, Dict, Any, Optional, TypedDict from django.conf import settings from django.utils import timezone +from django.utils.safestring import mark_safe # Plugin directories @@ -80,51 +81,33 @@ USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins' def extract_step(hook_name: str) -> int: """ - Extract step number (0-9) from hook name. + Deprecated: return the two-digit order prefix as an integer (00-99) if present. - Hooks are numbered 00-99 with the first digit determining the step. - Pattern: on_{Model}__{XX}_{description}[.bg].{ext} - - Args: - hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py') - - Returns: - Step number 0-9, or 9 (default) for unnumbered hooks. - - Examples: - extract_step('on_Snapshot__05_chrome.py') -> 0 - extract_step('on_Snapshot__50_wget.py') -> 5 - extract_step('on_Snapshot__63_media.bg.py') -> 6 - extract_step('on_Snapshot__99_cleanup.sh') -> 9 - extract_step('on_Snapshot__unnumbered.py') -> 9 (default) + Hook execution is based on lexicographic ordering of filenames; callers should + not rely on parsed numeric steps for ordering decisions. 
""" - # Pattern matches __XX_ where XX is two digits match = re.search(r'__(\d{2})_', hook_name) if match: - two_digit = int(match.group(1)) - step = two_digit // 10 # First digit is the step (0-9) - return step - - # Log warning for unnumbered hooks and default to step 9 + return int(match.group(1)) import sys - print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr) - return 9 + print(f"Warning: Hook '{hook_name}' has no order prefix (expected __XX_), defaulting to 99", file=sys.stderr) + return 99 def is_background_hook(hook_name: str) -> bool: """ - Check if a hook is a background hook (doesn't block step advancement). + Check if a hook is a background hook (doesn't block foreground progression). Background hooks have '.bg.' in their filename before the extension. Args: - hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js') + hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.bg.js') Returns: True if background hook, False if foreground. 
Examples: - is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True + is_background_hook('on_Snapshot__10_chrome_tab.bg.js') -> True is_background_hook('on_Snapshot__50_wget.py') -> False is_background_hook('on_Snapshot__63_media.bg.py') -> True """ @@ -273,6 +256,7 @@ def run_hook( """ from archivebox.machine.models import Process, Machine import time + import sys start_time = time.time() # Auto-detect timeout from plugin config if not explicitly provided @@ -313,7 +297,7 @@ def run_hook( if ext == '.sh': cmd = ['bash', str(script)] elif ext == '.py': - cmd = ['python3', str(script)] + cmd = [sys.executable, str(script)] elif ext == '.js': cmd = ['node', str(script)] else: @@ -393,10 +377,10 @@ def run_hook( # Priority: config dict > Machine.config > derive from LIB_DIR node_path = config.get('NODE_PATH') if not node_path and lib_dir: - # Derive from LIB_DIR/npm/node_modules + # Derive from LIB_DIR/npm/node_modules (create if needed) node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules' - if node_modules_dir.exists(): - node_path = str(node_modules_dir) + node_modules_dir.mkdir(parents=True, exist_ok=True) + node_path = str(node_modules_dir) if not node_path: try: # Fallback to Machine.config @@ -462,7 +446,7 @@ def run_hook( cmd=cmd, timeout=timeout, status=Process.StatusChoices.EXITED, - exit_code=-1, + exit_code=1, stderr=f'Failed to run hook: {type(e).__name__}: {e}', ) return process @@ -472,7 +456,6 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]: """ Extract JSONL records from a Process's stdout. - Uses the same parse_line() logic from misc/jsonl.py. Adds plugin metadata to each record. 
Args: @@ -481,32 +464,20 @@ def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]: Returns: List of parsed JSONL records with plugin metadata """ - from archivebox.misc.jsonl import parse_line - - records = [] - - # Read stdout from process - stdout = process.stdout - if not stdout and process.stdout_file and process.stdout_file.exists(): - stdout = process.stdout_file.read_text() - - if not stdout: - return records + records = process.get_records() + if not records: + return [] # Extract plugin metadata from process.pwd and process.cmd plugin_name = Path(process.pwd).name if process.pwd else 'unknown' hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown' plugin_hook = process.cmd[1] if len(process.cmd) > 1 else '' - # Parse each line as JSONL - for line in stdout.splitlines(): - record = parse_line(line) - if record and 'type' in record: - # Add plugin metadata to record - record.setdefault('plugin', plugin_name) - record.setdefault('hook_name', hook_name) - record.setdefault('plugin_hook', plugin_hook) - records.append(record) + for record in records: + # Add plugin metadata to record + record.setdefault('plugin', plugin_name) + record.setdefault('hook_name', hook_name) + record.setdefault('plugin_hook', plugin_hook) return records @@ -538,18 +509,13 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]: continue try: - with open(urls_file, 'r') as f: - for line in f: - line = line.strip() - if line: - try: - entry = json.loads(line) - if entry.get('url'): - # Track which parser plugin found this URL - entry['plugin'] = subdir.name - urls.append(entry) - except json.JSONDecodeError: - continue + from archivebox.machine.models import Process + text = urls_file.read_text() + for entry in Process.parse_records_from_text(text): + if entry.get('url'): + # Track which parser plugin found this URL + entry['plugin'] = subdir.name + urls.append(entry) except Exception: pass @@ -610,8 +576,8 @@ def 
get_plugins() -> List[str]: The plugin name is the plugin directory name, not the hook script name. Example: - archivebox/plugins/chrome_session/on_Snapshot__20_chrome_tab.bg.js - -> plugin = 'chrome_session' + archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js + -> plugin = 'chrome' Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names). """ @@ -817,7 +783,7 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]: Returns: Dict mapping plugin names to their parsed JSONSchema configs. - e.g., {'wget': {...schema...}, 'chrome_session': {...schema...}} + e.g., {'wget': {...schema...}, 'chrome': {...schema...}} Example config.json: { @@ -928,14 +894,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ if plugins_whitelist: # PLUGINS whitelist is specified - only enable plugins in the list plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()] - import sys - print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr) if plugin_name.lower() not in plugin_names: # Plugin not in whitelist - explicitly disabled - print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr) enabled = False else: - print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr) # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) @@ -945,10 +907,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ enabled = enabled.lower() not in ('false', '0', 'no', '') else: # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) - import sys enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) - print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr) if enabled is None: 
enabled = True elif isinstance(enabled, str): @@ -1064,10 +1024,10 @@ def get_plugin_icon(plugin: str) -> str: # Try plugin-provided icon template icon_template = get_plugin_template(plugin, 'icon', fallback=False) if icon_template: - return icon_template.strip() + return mark_safe(icon_template.strip()) # Fall back to generic folder icon - return '📁' + return mark_safe('📁') def get_all_plugin_icons() -> Dict[str, str]: @@ -1204,18 +1164,14 @@ def create_model_record(record: Dict[str, Any]) -> Any: return obj elif record_type == 'Machine': - # Machine config update (special _method handling) - method = record.pop('_method', None) - if method == 'update': - key = record.get('key') - value = record.get('value') - if key and value: - machine = Machine.current() - if not machine.config: - machine.config = {} - machine.config[key] = value - machine.save(update_fields=['config']) - return machine + config_patch = record.get('config') + if isinstance(config_patch, dict) and config_patch: + machine = Machine.current() + if not machine.config: + machine.config = {} + machine.config.update(config_patch) + machine.save(update_fields=['config']) + return machine return None # Add more types as needed (Dependency, Snapshot, etc.) 
diff --git a/archivebox/machine/detect.py b/archivebox/machine/detect.py index 84595d77..9d44df0d 100644 --- a/archivebox/machine/detect.py +++ b/archivebox/machine/detect.py @@ -227,33 +227,45 @@ def get_os_info() -> Dict[str, Any]: } def get_host_stats() -> Dict[str, Any]: - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_usage = psutil.disk_usage(str(tmp_dir)) - app_usage = psutil.disk_usage(str(PACKAGE_DIR)) - data_usage = psutil.disk_usage(str(DATA_DIR)) - mem_usage = psutil.virtual_memory() - swap_usage = psutil.swap_memory() - return { - "cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(), - "cpu_count": psutil.cpu_count(logical=False), - "cpu_load": psutil.getloadavg(), - # "cpu_pct": psutil.cpu_percent(interval=1), - "mem_virt_used_pct": mem_usage.percent, - "mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3), - "mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3), - "mem_swap_used_pct": swap_usage.percent, - "mem_swap_used_gb": round(swap_usage.used / 1024 / 1024 / 1024, 3), - "mem_swap_free_gb": round(swap_usage.free / 1024 / 1024 / 1024, 3), - "disk_tmp_used_pct": tmp_usage.percent, - "disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3), - "disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3), # in GB - "disk_app_used_pct": app_usage.percent, - "disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3), - "disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3), - "disk_data_used_pct": data_usage.percent, - "disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3), - "disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3), - } + try: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_usage = psutil.disk_usage(str(tmp_dir)) + app_usage = psutil.disk_usage(str(PACKAGE_DIR)) + data_usage = psutil.disk_usage(str(DATA_DIR)) + mem_usage = psutil.virtual_memory() + try: + swap_usage = psutil.swap_memory() + swap_used_pct = swap_usage.percent + 
swap_used_gb = round(swap_usage.used / 1024 / 1024 / 1024, 3) + swap_free_gb = round(swap_usage.free / 1024 / 1024 / 1024, 3) + except OSError: + # Some sandboxed environments deny access to swap stats + swap_used_pct = 0.0 + swap_used_gb = 0.0 + swap_free_gb = 0.0 + return { + "cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(), + "cpu_count": psutil.cpu_count(logical=False), + "cpu_load": psutil.getloadavg(), + # "cpu_pct": psutil.cpu_percent(interval=1), + "mem_virt_used_pct": mem_usage.percent, + "mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3), + "mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3), + "mem_swap_used_pct": swap_used_pct, + "mem_swap_used_gb": swap_used_gb, + "mem_swap_free_gb": swap_free_gb, + "disk_tmp_used_pct": tmp_usage.percent, + "disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3), + "disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3), # in GB + "disk_app_used_pct": app_usage.percent, + "disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3), + "disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3), + "disk_data_used_pct": data_usage.percent, + "disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3), + "disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3), + } + except Exception: + return {} def get_host_immutable_info(host_info: Dict[str, Any]) -> Dict[str, Any]: return { diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 07da29ec..210452f9 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -113,23 +113,20 @@ class Machine(ModelWithHealthStats): Update Machine config from JSON dict. Args: - record: JSON dict with '_method': 'update', 'key': '...', 'value': '...' 
+ record: JSON dict with 'config': {key: value} patch overrides: Not used Returns: Machine instance or None """ - method = record.get('_method') - if method == 'update': - key = record.get('key') - value = record.get('value') - if key and value: - machine = Machine.current() - if not machine.config: - machine.config = {} - machine.config[key] = value - machine.save(update_fields=['config']) - return machine + config_patch = record.get('config') + if isinstance(config_patch, dict) and config_patch: + machine = Machine.current() + if not machine.config: + machine.config = {} + machine.config.update(config_patch) + machine.save(update_fields=['config']) + return machine return None @@ -458,31 +455,31 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): continue # Parse JSONL output to check for successful installation - stdout_file = plugin_output_dir / 'stdout.log' - if stdout_file.exists(): - stdout = stdout_file.read_text() - for line in stdout.splitlines(): - if line.strip() and line.strip().startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'Binary' and record.get('abspath'): - # Update self from successful installation - self.abspath = record['abspath'] - self.version = record.get('version', '') - self.sha256 = record.get('sha256', '') - self.binprovider = record.get('binprovider', 'env') - self.status = self.StatusChoices.INSTALLED - self.save() + from archivebox.hooks import extract_records_from_process, process_hook_records + records = extract_records_from_process(process) + if records: + process_hook_records(records, overrides={}) + binary_records = [ + record for record in records + if record.get('type') == 'Binary' and record.get('abspath') + ] + if binary_records: + record = binary_records[0] + # Update self from successful installation + self.abspath = record['abspath'] + self.version = record.get('version', '') + self.sha256 = record.get('sha256', '') + self.binprovider = record.get('binprovider', 'env') + 
self.status = self.StatusChoices.INSTALLED + self.save() - # Symlink binary into LIB_BIN_DIR if configured - from django.conf import settings - lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None) - if lib_bin_dir: - self.symlink_to_lib_bin(lib_bin_dir) + # Symlink binary into LIB_BIN_DIR if configured + from django.conf import settings + lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None) + if lib_bin_dir: + self.symlink_to_lib_bin(lib_bin_dir) - return - except json.JSONDecodeError: - continue + return # No hook succeeded - leave status as QUEUED (will retry later) # Don't set to FAILED since we don't have that status anymore @@ -861,6 +858,27 @@ class Process(models.Model): record['timeout'] = self.timeout return record + @classmethod + def parse_records_from_text(cls, text: str) -> list[dict]: + """Parse JSONL records from raw text using the shared JSONL parser.""" + from archivebox.misc.jsonl import parse_line + + records: list[dict] = [] + if not text: + return records + for line in text.splitlines(): + record = parse_line(line) + if record and record.get('type'): + records.append(record) + return records + + def get_records(self) -> list[dict]: + """Parse JSONL records from this process's stdout.""" + stdout = self.stdout + if not stdout and self.stdout_file and self.stdout_file.exists(): + stdout = self.stdout_file.read_text() + return self.parse_records_from_text(stdout or '') + @staticmethod def from_json(record: dict, overrides: dict = None): """ @@ -919,6 +937,7 @@ class Process(models.Model): if (_CURRENT_PROCESS.pid == current_pid and _CURRENT_PROCESS.machine_id == machine.id and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)): + _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS _CURRENT_PROCESS = None @@ -945,6 +964,7 @@ class Process(models.Model): db_start_time = existing.started_at.timestamp() if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: _CURRENT_PROCESS = existing + 
_CURRENT_PROCESS.ensure_log_files() return existing # No valid existing record - create new one @@ -977,6 +997,7 @@ class Process(models.Model): started_at=started_at, status=cls.StatusChoices.RUNNING, ) + _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS @classmethod @@ -1089,7 +1110,7 @@ class Process(models.Model): if is_stale: proc.status = cls.StatusChoices.EXITED proc.ended_at = proc.ended_at or timezone.now() - proc.exit_code = proc.exit_code if proc.exit_code is not None else -1 + proc.exit_code = proc.exit_code if proc.exit_code is not None else 0 proc.save(update_fields=['status', 'ended_at', 'exit_code']) cleaned += 1 @@ -1209,7 +1230,15 @@ class Process(models.Model): the actual OS process exists and matches our record. """ proc = self.proc - return proc is not None and proc.is_running() + if proc is None: + return False + try: + # Treat zombies as not running (they should be reaped) + if proc.status() == psutil.STATUS_ZOMBIE: + return False + except Exception: + pass + return proc.is_running() def is_alive(self) -> bool: """ @@ -1421,6 +1450,22 @@ class Process(models.Model): except OSError: pass + def ensure_log_files(self) -> None: + """Ensure stdout/stderr log files exist for this process.""" + if not self.pwd: + return + try: + Path(self.pwd).mkdir(parents=True, exist_ok=True) + except OSError: + return + try: + if self.stdout_file: + self.stdout_file.touch(exist_ok=True) + if self.stderr_file: + self.stderr_file.touch(exist_ok=True) + except OSError: + return + def _build_env(self) -> dict: """Build environment dict for subprocess, merging stored env with system.""" import json @@ -1507,9 +1552,11 @@ class Process(models.Model): proc.wait(timeout=self.timeout) self.exit_code = proc.returncode except subprocess.TimeoutExpired: + import signal + proc.kill() proc.wait() - self.exit_code = -1 + self.exit_code = 128 + signal.SIGKILL self.ended_at = timezone.now() if stdout_path.exists(): @@ -1579,9 +1626,19 @@ class Process(models.Model): 
exit_code if exited, None if still running """ if self.status == self.StatusChoices.EXITED: + if self.exit_code == -1: + self.exit_code = 137 + self.save(update_fields=['exit_code']) return self.exit_code if not self.is_running: + # Reap child process if it's a zombie (best-effort) + proc = self.proc + if proc is not None: + try: + proc.wait(timeout=0) + except Exception: + pass # Process exited - read output and copy to DB if self.stdout_file and self.stdout_file.exists(): self.stdout = self.stdout_file.read_text() @@ -1603,7 +1660,9 @@ class Process(models.Model): # cmd_file.unlink(missing_ok=True) # Try to get exit code from proc or default to unknown - self.exit_code = self.exit_code if self.exit_code is not None else -1 + self.exit_code = self.exit_code if self.exit_code is not None else 0 + if self.exit_code == -1: + self.exit_code = 137 self.ended_at = timezone.now() self.status = self.StatusChoices.EXITED self.save() @@ -1723,6 +1782,7 @@ class Process(models.Model): import os killed_count = 0 + used_sigkill = False proc = self.proc if proc is None: # Already dead @@ -1772,11 +1832,15 @@ class Process(models.Model): try: os.kill(pid, signal.SIGKILL) killed_count += 1 + used_sigkill = True except (OSError, ProcessLookupError): pass # Update self status - self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0 + if used_sigkill: + self.exit_code = 128 + signal.SIGKILL + else: + self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0 self.status = self.StatusChoices.EXITED self.ended_at = timezone.now() self.save() @@ -1925,6 +1989,50 @@ class Process(models.Model): return 0 + @classmethod + def cleanup_orphaned_workers(cls) -> int: + """ + Kill orphaned worker/hook processes whose root process is no longer running. + + Orphaned if: + - Root (orchestrator/cli) is not running, or + - No orchestrator/cli ancestor exists. + + Standalone worker runs (archivebox run --snapshot-id) are allowed. 
+ """ + killed = 0 + + running_children = cls.objects.filter( + process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK], + status=cls.StatusChoices.RUNNING, + ) + + for proc in running_children: + if not proc.is_running: + continue + + root = proc.root + # Standalone worker/hook process (run directly) + if root.id == proc.id and root.process_type in (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK): + continue + + # If root is an active orchestrator/cli, keep it + if root.process_type in (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI) and root.is_running: + continue + + try: + if proc.process_type == cls.TypeChoices.HOOK: + proc.kill_tree(graceful_timeout=1.0) + else: + proc.terminate(graceful_timeout=1.0) + killed += 1 + except Exception: + continue + + if killed: + print(f'[yellow]🧹 Cleaned up {killed} orphaned worker/hook process(es)[/yellow]') + return killed + # ============================================================================= # Binary State Machine @@ -2126,5 +2234,3 @@ class ProcessMachine(BaseStateMachine, strict_states=True): # Manually register state machines with python-statemachine registry registry.register(BinaryMachine) registry.register(ProcessMachine) - - diff --git a/archivebox/machine/tests/test_machine_models.py b/archivebox/machine/tests/test_machine_models.py index 83875057..b36fd7a2 100644 --- a/archivebox/machine/tests/test_machine_models.py +++ b/archivebox/machine/tests/test_machine_models.py @@ -79,9 +79,9 @@ class TestMachineModel(TestCase): """Machine.from_json() should update machine config.""" Machine.current() # Ensure machine exists record = { - '_method': 'update', - 'key': 'WGET_BINARY', - 'value': '/usr/bin/wget', + 'config': { + 'WGET_BINARY': '/usr/bin/wget', + }, } result = Machine.from_json(record) @@ -190,12 +190,12 @@ class TestBinaryModel(TestCase): old_modified = binary.modified_at binary.update_and_requeue( - status=Binary.StatusChoices.STARTED, + status=Binary.StatusChoices.QUEUED, 
retry_at=timezone.now() + timedelta(seconds=60), ) binary.refresh_from_db() - self.assertEqual(binary.status, Binary.StatusChoices.STARTED) + self.assertEqual(binary.status, Binary.StatusChoices.QUEUED) self.assertGreater(binary.modified_at, old_modified) @@ -221,12 +221,12 @@ class TestBinaryStateMachine(TestCase): def test_binary_state_machine_can_start(self): """BinaryMachine.can_start() should check name and binproviders.""" sm = BinaryMachine(self.binary) - self.assertTrue(sm.can_start()) + self.assertTrue(sm.can_install()) self.binary.binproviders = '' self.binary.save() sm = BinaryMachine(self.binary) - self.assertFalse(sm.can_start()) + self.assertFalse(sm.can_install()) class TestProcessModel(TestCase): @@ -415,11 +415,15 @@ class TestProcessLifecycle(TestCase): def test_process_is_running_current_pid(self): """is_running should be True for current PID.""" + import psutil + from datetime import datetime + + proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone()) proc = Process.objects.create( machine=self.machine, status=Process.StatusChoices.RUNNING, pid=os.getpid(), - started_at=timezone.now(), + started_at=proc_start, ) self.assertTrue(proc.is_running) @@ -450,6 +454,22 @@ class TestProcessLifecycle(TestCase): proc.refresh_from_db() self.assertEqual(proc.status, Process.StatusChoices.EXITED) + def test_process_poll_normalizes_negative_exit_code(self): + """poll() should normalize -1 exit codes to 137.""" + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.EXITED, + pid=999999, + exit_code=-1, + started_at=timezone.now(), + ) + + exit_code = proc.poll() + + self.assertEqual(exit_code, 137) + proc.refresh_from_db() + self.assertEqual(proc.exit_code, 137) + def test_process_terminate_dead_process(self): """terminate() should handle already-dead process.""" proc = Process.objects.create( diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 
c5795d8a..09929d36 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -180,9 +180,11 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): return len(f'file://{socket_file}') <= 96 tmp_is_valid = False + allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes') try: tmp_is_valid = dir_is_writable(tmp_dir) - tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) + if not allow_no_unix_sockets: + tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}' assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.' return True diff --git a/archivebox/misc/progress_layout.py b/archivebox/misc/progress_layout.py index fc4001d8..2db2e1c1 100644 --- a/archivebox/misc/progress_layout.py +++ b/archivebox/misc/progress_layout.py @@ -3,30 +3,29 @@ Rich Layout-based live progress display for ArchiveBox orchestrator. 
Shows a comprehensive dashboard with: - Top: Crawl queue status (full width) -- Middle: 4-column grid of SnapshotWorker progress panels +- Middle: Running process logs (dynamic panels) - Bottom: Orchestrator/Daphne logs """ __package__ = 'archivebox.misc' from datetime import datetime, timezone -from typing import Dict, List, Optional, Any +from typing import List, Optional, Any from collections import deque +from pathlib import Path from rich import box from rich.align import Align -from rich.console import Console, Group, RenderableType +from rich.console import Group from rich.layout import Layout +from rich.columns import Columns from rich.panel import Panel -from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn -from rich.table import Table from rich.text import Text +from rich.table import Table +from rich.tree import Tree from archivebox.config import VERSION -# Maximum number of SnapshotWorker columns to display -MAX_WORKER_COLUMNS = 4 - class CrawlQueuePanel: """Display crawl queue status across full width.""" @@ -35,6 +34,8 @@ class CrawlQueuePanel: self.orchestrator_status = "Idle" self.crawl_queue_count = 0 self.crawl_workers_count = 0 + self.binary_queue_count = 0 + self.binary_workers_count = 0 self.max_crawl_workers = 8 self.crawl_id: Optional[str] = None @@ -51,19 +52,27 @@ class CrawlQueuePanel: left_text.append(f"v{VERSION}", style="bold yellow") left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53") - # Center-left: Crawl queue status + # Center-left: Crawl + Binary queue status queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53" center_left_text = Text() center_left_text.append("Crawls: ", style="white") center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}") center_left_text.append(" queued", style="grey53") + center_left_text.append(" • Binaries: ", style="white") + binary_queue_style = "yellow" if self.binary_queue_count > 0 else 
"grey53" + center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}") + center_left_text.append(" queued", style="grey53") - # Center-right: CrawlWorker status + # Center-right: Worker status worker_style = "green" if self.crawl_workers_count > 0 else "grey53" center_right_text = Text() center_right_text.append("Workers: ", style="white") center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}") - center_right_text.append(" active", style="grey53") + center_right_text.append(" crawl", style="grey53") + binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53" + center_right_text.append(" • ", style="grey53") + center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}") + center_right_text.append(" binary", style="grey53") # Right: Orchestrator status status_color = "green" if self.crawl_workers_count > 0 else "grey53" @@ -74,151 +83,302 @@ class CrawlQueuePanel: right_text.append(f" [{self.crawl_id[:8]}]", style="grey53") grid.add_row(left_text, center_left_text, center_right_text, right_text) - return Panel(grid, style="white on blue", box=box.ROUNDED) + return Panel(grid, style="white on blue", box=box.HORIZONTALS) -class SnapshotWorkerPanel: - """Display progress for a single SnapshotWorker.""" +class ProcessLogPanel: + """Display logs for a running Process.""" - def __init__(self, worker_num: int): - self.worker_num = worker_num - self.snapshot_id: Optional[str] = None - self.snapshot_url: Optional[str] = None - self.total_hooks: int = 0 - self.completed_hooks: int = 0 - self.current_plugin: Optional[str] = None - self.status: str = "idle" # idle, working, completed - self.recent_logs: deque = deque(maxlen=5) + def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None): + self.process = process + self.max_lines = max_lines + self.compact = compact def __rich__(self) -> Panel: - if self.status == "idle": 
- content = Align.center( - Text("Idle", style="grey53"), - vertical="middle", - ) - border_style = "grey53" - title_style = "grey53" - else: - # Build progress display - lines = [] + is_pending = self._is_pending() + output_line = '' if is_pending else self._output_line() + stdout_lines = [] + stderr_lines = [] + try: + stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False)) + stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False)) + except Exception: + stdout_lines = [] + stderr_lines = [] - # URL (truncated) - if self.snapshot_url: - url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url - lines.append(Text(url_display, style="cyan")) - lines.append(Text()) # Spacing + header_lines = [] + chrome_launch_line = self._chrome_launch_line(stderr_lines, stdout_lines) + if chrome_launch_line: + header_lines.append(Text(chrome_launch_line, style="grey53")) + if output_line: + header_lines.append(Text(output_line, style="grey53")) + log_lines = [] + for line in stdout_lines: + if line: + log_lines.append(Text(line, style="white")) + for line in stderr_lines: + if line: + log_lines.append(Text(line, style="cyan")) - # Progress bar - if self.total_hooks > 0: - pct = (self.completed_hooks / self.total_hooks) * 100 - bar_width = 30 - filled = int((pct / 100) * bar_width) - bar = "█" * filled + "░" * (bar_width - filled) + compact = self.compact if self.compact is not None else self._is_background_hook() + max_body = max(1, self.max_lines - len(header_lines)) + if not log_lines: + log_lines = [] - # Color based on progress - if pct < 30: - bar_style = "yellow" - elif pct < 100: - bar_style = "green" - else: - bar_style = "blue" + lines = header_lines + log_lines[-max_body:] - progress_text = Text() - progress_text.append(bar, style=bar_style) - progress_text.append(f" {pct:.0f}%", style="white") - lines.append(progress_text) - lines.append(Text()) # Spacing - - # Stats - stats = 
Table.grid(padding=(0, 1)) - stats.add_column(style="grey53", no_wrap=True) - stats.add_column(style="white") - stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}") - if self.current_plugin: - stats.add_row("Current:", Text(self.current_plugin, style="yellow")) - lines.append(stats) - lines.append(Text()) # Spacing - - # Recent logs - if self.recent_logs: - lines.append(Text("Recent:", style="grey53")) - for log_msg, log_style in self.recent_logs: - log_text = Text(f"• {log_msg[:30]}", style=log_style) - lines.append(log_text) - - content = Group(*lines) - border_style = "green" if self.status == "working" else "blue" - title_style = "green" if self.status == "working" else "blue" + content = Group(*lines) if lines else Text("") + title = self._title() + border_style = "grey53" if is_pending else "cyan" + height = 2 if is_pending else None return Panel( content, - title=f"[{title_style}]Worker {self.worker_num}", + title=title, border_style=border_style, - box=box.ROUNDED, - height=20, + box=box.HORIZONTALS, + padding=(0, 1), + height=height, ) - def add_log(self, message: str, style: str = "white"): - """Add a log message to this worker's recent logs.""" - self.recent_logs.append((message, style)) + def _title(self) -> str: + process_type = getattr(self.process, 'process_type', 'process') + worker_type = getattr(self.process, 'worker_type', '') + pid = getattr(self.process, 'pid', None) + label = process_type + if process_type == 'worker' and worker_type: + label, worker_suffix = self._worker_label(worker_type) + elif process_type == 'hook': + try: + cmd = getattr(self.process, 'cmd', []) + hook_path = Path(cmd[1]) if len(cmd) > 1 else None + hook_name = hook_path.name if hook_path else 'hook' + plugin_name = hook_path.parent.name if hook_path and hook_path.parent.name else 'hook' + except Exception: + hook_name = 'hook' + plugin_name = 'hook' + label = f"{plugin_name}/{hook_name}" + worker_suffix = '' + else: + worker_suffix = '' + + url = 
self._extract_url() + url_suffix = f" url={self._abbrev_url(url)}" if url else "" + time_suffix = self._elapsed_suffix() + title_style = "grey53" if self._is_pending() else "bold white" + if pid: + return f"[{title_style}]{label}[/{title_style}] [grey53]pid={pid}{worker_suffix}{url_suffix}{time_suffix}[/grey53]" + return f"[{title_style}]{label}[/{title_style}]{f' [grey53]{worker_suffix.strip()} {url_suffix.strip()}{time_suffix}[/grey53]' if (worker_suffix or url_suffix or time_suffix) else ''}".rstrip() + + def _is_background_hook(self) -> bool: + if getattr(self.process, 'process_type', '') != 'hook': + return False + try: + cmd = getattr(self.process, 'cmd', []) + hook_path = Path(cmd[1]) if len(cmd) > 1 else None + hook_name = hook_path.name if hook_path else '' + return '.bg.' in hook_name + except Exception: + return False + + def _is_pending(self) -> bool: + status = getattr(self.process, 'status', '') + if status in ('queued', 'pending', 'backoff'): + return True + if getattr(self.process, 'process_type', '') == 'hook' and not getattr(self.process, 'pid', None): + return True + return False + + def _worker_label(self, worker_type: str) -> tuple[str, str]: + cmd = getattr(self.process, 'cmd', []) or [] + if worker_type == 'crawl': + crawl_id = self._extract_arg(cmd, '--crawl-id') + suffix = '' + if crawl_id: + suffix = f" id={str(crawl_id)[-8:]}" + try: + from archivebox.crawls.models import Crawl + crawl = Crawl.objects.filter(id=crawl_id).first() + if crawl: + urls = crawl.get_urls_list() + if urls: + url_list = self._abbrev_urls(urls) + suffix += f" urls={url_list}" + except Exception: + pass + return 'crawl', suffix + if worker_type == 'snapshot': + snapshot_id = self._extract_arg(cmd, '--snapshot-id') + suffix = '' + if snapshot_id: + suffix = f" id={str(snapshot_id)[-8:]}" + try: + from archivebox.core.models import Snapshot + snap = Snapshot.objects.filter(id=snapshot_id).first() + if snap and snap.url: + suffix += f" url={self._abbrev_url(snap.url, 
max_len=48)}" + except Exception: + pass + return 'snapshot', suffix + return f"worker:{worker_type}", '' + + @staticmethod + def _extract_arg(cmd: list[str], key: str) -> str | None: + for i, part in enumerate(cmd): + if part.startswith(f'{key}='): + return part.split('=', 1)[1] + if part == key and i + 1 < len(cmd): + return cmd[i + 1] + return None + + def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str: + if not urls: + return '' + if len(urls) == 1: + return self._abbrev_url(urls[0], max_len=max_len) + first = self._abbrev_url(urls[0], max_len=max_len) + return f"{first},+{len(urls) - 1}" + + def _extract_url(self) -> str: + url = getattr(self.process, 'url', None) + if url: + return str(url) + cmd = getattr(self.process, 'cmd', []) or [] + for i, part in enumerate(cmd): + if part.startswith('--url='): + return part.split('=', 1)[1].strip() + if part == '--url' and i + 1 < len(cmd): + return str(cmd[i + 1]).strip() + return '' + + def _abbrev_url(self, url: str, max_len: int = 48) -> str: + if not url: + return '' + if len(url) <= max_len: + return url + return f"{url[:max_len - 3]}..." 
+ + def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str: + try: + cmd = getattr(self.process, 'cmd', []) + hook_path = Path(cmd[1]) if len(cmd) > 1 else None + hook_name = hook_path.name if hook_path else '' + if 'chrome_launch' not in hook_name: + return '' + + pid = '' + ws = '' + for line in stderr_lines + stdout_lines: + if not ws and 'CDP URL:' in line: + ws = line.split('CDP URL:', 1)[1].strip() + if not pid and 'PID:' in line: + pid = line.split('PID:', 1)[1].strip() + + if pid and ws: + return f"Chrome pid={pid} {ws}" + if ws: + return f"Chrome {ws}" + if pid: + return f"Chrome pid={pid}" + try: + from archivebox import DATA_DIR + base = Path(DATA_DIR) + pwd = getattr(self.process, 'pwd', None) + if pwd: + chrome_dir = Path(pwd) + if not chrome_dir.is_absolute(): + chrome_dir = (base / chrome_dir).resolve() + cdp_file = chrome_dir / 'cdp_url.txt' + pid_file = chrome_dir / 'chrome.pid' + if cdp_file.exists(): + ws = cdp_file.read_text().strip() + if pid_file.exists(): + pid = pid_file.read_text().strip() + if pid and ws: + return f"Chrome pid={pid} {ws}" + if ws: + return f"Chrome {ws}" + if pid: + return f"Chrome pid={pid}" + except Exception: + pass + except Exception: + return '' + return '' + + def _elapsed_suffix(self) -> str: + started_at = getattr(self.process, 'started_at', None) + timeout = getattr(self.process, 'timeout', None) + if not started_at or not timeout: + return '' + try: + now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now() + elapsed = int((now - started_at).total_seconds()) + elapsed = max(elapsed, 0) + return f" [{elapsed}/{int(timeout)}s]" + except Exception: + return '' + + def _output_line(self) -> str: + pwd = getattr(self.process, 'pwd', None) + if not pwd: + return '' + try: + from archivebox import DATA_DIR + rel = Path(pwd) + base = Path(DATA_DIR) + if rel.is_absolute(): + try: + rel = rel.relative_to(base) + except Exception: + pass + rel_str = f"./{rel}" if not 
str(rel).startswith("./") else str(rel) + return f"{rel_str}" + except Exception: + return f"{pwd}" -class CrawlWorkerLogPanel: - """Display CrawlWorker logs by tailing stdout/stderr from Process.""" +class WorkerLogPanel: + """Display worker logs by tailing stdout/stderr from Process.""" - def __init__(self, max_lines: int = 8): + def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8): + self.title = title + self.empty_message = empty_message + self.running_message = running_message self.log_lines: deque = deque(maxlen=max_lines * 2) # Allow more buffer self.max_lines = max_lines self.last_stdout_pos = 0 # Track file position for efficient tailing self.last_stderr_pos = 0 + self.last_process_running = False def update_from_process(self, process: Any): """Update logs by tailing the Process stdout/stderr files.""" - from pathlib import Path - if not process: + self.last_process_running = False return - # Read new stdout lines since last read + # Use Process tail helpers for consistency try: - stdout_path = Path(process.stdout) - if stdout_path.exists(): - with open(stdout_path, 'r') as f: - # Seek to last read position - f.seek(self.last_stdout_pos) - new_lines = f.readlines() - - # Update position - self.last_stdout_pos = f.tell() - - # Add new lines (up to max_lines to avoid overflow) - for line in new_lines[-self.max_lines:]: - line = line.rstrip('\n') - if line and not line.startswith('['): # Skip Rich markup lines - self.log_lines.append(('stdout', line)) + self.last_process_running = bool(getattr(process, 'is_running', False)) + stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False)) + stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False)) except Exception: - pass + return - # Read new stderr lines since last read - try: - stderr_path = Path(process.stderr) - if stderr_path.exists(): - with open(stderr_path, 'r') as f: - f.seek(self.last_stderr_pos) - new_lines = f.readlines() + 
self.log_lines.clear() - self.last_stderr_pos = f.tell() - - for line in new_lines[-self.max_lines:]: - line = line.rstrip('\n') - if line and not line.startswith('['): # Skip Rich markup lines - self.log_lines.append(('stderr', line)) - except Exception: - pass + # Preserve ordering by showing stdout then stderr + for line in stdout_lines: + if line: + self.log_lines.append(('stdout', line)) + for line in stderr_lines: + if line: + self.log_lines.append(('stderr', line)) def __rich__(self) -> Panel: if not self.log_lines: - content = Text("No CrawlWorker logs yet", style="grey53", justify="center") + message = self.running_message if self.last_process_running else self.empty_message + content = Text(message, style="grey53", justify="center") else: # Get the last max_lines for display display_lines = list(self.log_lines)[-self.max_lines:] @@ -236,9 +396,9 @@ class CrawlWorkerLogPanel: return Panel( content, - title="[bold cyan]CrawlWorker Logs (stdout/stderr)", + title=f"[bold cyan]{self.title}", border_style="cyan", - box=box.ROUNDED, + box=box.HORIZONTALS, ) @@ -270,10 +430,71 @@ class OrchestratorLogPanel: content, title="[bold white]Orchestrator / Daphne Logs", border_style="white", - box=box.ROUNDED, + box=box.HORIZONTALS, ) +class CrawlQueueTreePanel: + """Display crawl queue with snapshots + hook summary in a tree view.""" + + def __init__(self, max_crawls: int = 8, max_snapshots: int = 16): + self.crawls: list[dict[str, Any]] = [] + self.max_crawls = max_crawls + self.max_snapshots = max_snapshots + + def update_crawls(self, crawls: list[dict[str, Any]]) -> None: + """Update crawl tree data.""" + self.crawls = crawls[:self.max_crawls] + + def __rich__(self) -> Panel: + if not self.crawls: + content = Text("No active crawls", style="grey53", justify="center") + else: + trees = [] + for crawl in self.crawls: + crawl_status = crawl.get('status', '') + crawl_label = crawl.get('label', '') + crawl_id = crawl.get('id', '')[:8] + crawl_text = 
Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white") + crawl_tree = Tree(crawl_text, guide_style="grey53") + + snapshots = crawl.get('snapshots', [])[:self.max_snapshots] + for snap in snapshots: + snap_status = snap.get('status', '') + snap_label = snap.get('label', '') + snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white") + snap_node = crawl_tree.add(snap_text) + + hooks = snap.get('hooks', {}) + if hooks: + completed = hooks.get('completed', 0) + running = hooks.get('running', 0) + pending = hooks.get('pending', 0) + summary = f"✅ {completed} | ▶️ {running} | ⌛️ {pending}" + snap_node.add(Text(summary, style="grey53")) + trees.append(crawl_tree) + content = Group(*trees) + + return Panel( + content, + title="[bold white]Crawl Queue", + border_style="white", + box=box.HORIZONTALS, + ) + + @staticmethod + def _status_icon(status: str) -> str: + if status in ('queued', 'pending'): + return '⏳' + if status in ('started', 'running'): + return '▶' + if status in ('sealed', 'done', 'completed'): + return '✅' + if status in ('failed', 'error'): + return '✖' + return '•' + + class ArchiveBoxProgressLayout: """ Main layout manager for ArchiveBox orchestrator progress display. 
@@ -281,15 +502,8 @@ class ArchiveBoxProgressLayout: Layout structure: ┌─────────────────────────────────────────────────────────────┐ │ Crawl Queue (full width) │ - ├───────────────┬───────────────┬───────────────┬─────────────┤ - │ Snapshot │ Snapshot │ Snapshot │ Snapshot │ - │ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │ - │ │ │ │ │ - │ Progress + │ Progress + │ Progress + │ Progress + │ - │ Stats + │ Stats + │ Stats + │ Stats + │ - │ Logs │ Logs │ Logs │ Logs │ - ├───────────────┴───────────────┴───────────────┴─────────────┤ - │ CrawlWorker Logs (stdout/stderr) │ + ├─────────────────────────────────────────────────────────────┤ + │ Running Process Logs (dynamic panels) │ ├─────────────────────────────────────────────────────────────┤ │ Orchestrator / Daphne Logs │ └─────────────────────────────────────────────────────────────┘ @@ -303,51 +517,33 @@ class ArchiveBoxProgressLayout: self.crawl_queue = CrawlQueuePanel() self.crawl_queue.crawl_id = crawl_id - # Create 4 worker panels - self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)] - - self.crawl_worker_log = CrawlWorkerLogPanel(max_lines=8) + self.process_panels: List[ProcessLogPanel] = [] self.orchestrator_log = OrchestratorLogPanel(max_events=8) + self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16) # Create layout self.layout = self._make_layout() - # Track snapshot ID to worker panel mapping - self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index - def _make_layout(self) -> Layout: """Define the layout structure.""" layout = Layout(name="root") - # Top-level split: crawl_queue, workers, logs + # Top-level split: crawl_queue, workers, bottom layout.split( Layout(name="crawl_queue", size=3), - Layout(name="workers", ratio=1), - Layout(name="logs", size=20), - ) - - # Split workers into 4 columns - layout["workers"].split_row( - Layout(name="worker1"), - Layout(name="worker2"), - Layout(name="worker3"), - 
Layout(name="worker4"), - ) - - # Split logs into crawl_worker_logs and orchestrator_logs - layout["logs"].split( - Layout(name="crawl_worker_logs", size=10), - Layout(name="orchestrator_logs", size=10), + Layout(name="processes", ratio=1), + Layout(name="bottom", size=12), ) # Assign components to layout sections layout["crawl_queue"].update(self.crawl_queue) - layout["worker1"].update(self.worker_panels[0]) - layout["worker2"].update(self.worker_panels[1]) - layout["worker3"].update(self.worker_panels[2]) - layout["worker4"].update(self.worker_panels[3]) - layout["crawl_worker_logs"].update(self.crawl_worker_log) + layout["processes"].update(Columns([])) + layout["bottom"].split_row( + Layout(name="orchestrator_logs", ratio=2), + Layout(name="crawl_tree", ratio=1), + ) layout["orchestrator_logs"].update(self.orchestrator_log) + layout["crawl_tree"].update(self.crawl_queue_tree) return layout @@ -356,82 +552,53 @@ class ArchiveBoxProgressLayout: status: str, crawl_queue_count: int = 0, crawl_workers_count: int = 0, + binary_queue_count: int = 0, + binary_workers_count: int = 0, max_crawl_workers: int = 8, ): """Update orchestrator status in the crawl queue panel.""" self.crawl_queue.orchestrator_status = status self.crawl_queue.crawl_queue_count = crawl_queue_count self.crawl_queue.crawl_workers_count = crawl_workers_count + self.crawl_queue.binary_queue_count = binary_queue_count + self.crawl_queue.binary_workers_count = binary_workers_count self.crawl_queue.max_crawl_workers = max_crawl_workers - def update_snapshot_worker( - self, - snapshot_id: str, - url: str, - total: int, - completed: int, - current_plugin: str = "", - ): - """Update or assign a snapshot to a worker panel.""" - # Find or assign worker panel for this snapshot - if snapshot_id not in self.snapshot_to_worker: - # Find first idle worker panel - worker_idx = None - for idx, panel in enumerate(self.worker_panels): - if panel.status == "idle": - worker_idx = idx - break + def 
update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None: + """Update process panels to show all running processes.""" + panels = [] + all_processes = list(processes) + list(pending or []) + for process in all_processes: + is_hook = getattr(process, 'process_type', '') == 'hook' + is_bg = False + if is_hook: + try: + cmd = getattr(process, 'cmd', []) + hook_path = Path(cmd[1]) if len(cmd) > 1 else None + hook_name = hook_path.name if hook_path else '' + is_bg = '.bg.' in hook_name + except Exception: + is_bg = False + is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None)) + max_lines = 2 if is_pending else (4 if is_bg else 7) + panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg)) + if not panels: + self.layout["processes"].size = 0 + self.layout["processes"].update(Text("")) + return - # If no idle worker, use round-robin (shouldn't happen often) - if worker_idx is None: - worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS + self.layout["processes"].size = None + self.layout["processes"].ratio = 1 + self.layout["processes"].update(Columns(panels, equal=True, expand=True)) - self.snapshot_to_worker[snapshot_id] = worker_idx + def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None: + """Update the crawl queue tree panel.""" + self.crawl_queue_tree.update_crawls(crawls) - # Get assigned worker panel - worker_idx = self.snapshot_to_worker[snapshot_id] - panel = self.worker_panels[worker_idx] - - # Update panel - panel.snapshot_id = snapshot_id - panel.snapshot_url = url - panel.total_hooks = total - panel.completed_hooks = completed - panel.current_plugin = current_plugin - panel.status = "working" if completed < total else "completed" - - def remove_snapshot_worker(self, snapshot_id: str): - """Mark a snapshot worker as idle after completion.""" - if snapshot_id in self.snapshot_to_worker: - worker_idx = 
self.snapshot_to_worker[snapshot_id] - panel = self.worker_panels[worker_idx] - - # Mark as idle - panel.status = "idle" - panel.snapshot_id = None - panel.snapshot_url = None - panel.total_hooks = 0 - panel.completed_hooks = 0 - panel.current_plugin = None - panel.recent_logs.clear() - - # Remove mapping - del self.snapshot_to_worker[snapshot_id] - - def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"): - """Add a log message to a specific worker's panel.""" - if snapshot_id in self.snapshot_to_worker: - worker_idx = self.snapshot_to_worker[snapshot_id] - self.worker_panels[worker_idx].add_log(message, style) - - def log_event(self, message: str, style: str = "white"): + def log_event(self, message: str, style: str = "white") -> None: """Add an event to the orchestrator log.""" self.orchestrator_log.add_event(message, style) - def update_crawl_worker_logs(self, process: Any): - """Update CrawlWorker logs by tailing the Process stdout/stderr files.""" - self.crawl_worker_log.update_from_process(process) - def get_layout(self) -> Layout: """Get the Rich Layout object for rendering.""" return self.layout diff --git a/archivebox/plugins/accessibility/templates/icon.html b/archivebox/plugins/accessibility/templates/icon.html index e69de29b..e1c30fa0 100644 --- a/archivebox/plugins/accessibility/templates/icon.html +++ b/archivebox/plugins/accessibility/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/accessibility/tests/__init__.py b/archivebox/plugins/accessibility/tests/__init__.py deleted file mode 100644 index fffe074b..00000000 --- a/archivebox/plugins/accessibility/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the accessibility plugin.""" diff --git a/archivebox/plugins/apt/on_Binary__13_apt_install.py b/archivebox/plugins/apt/on_Binary__13_apt_install.py index af8506df..82e343ff 100644 --- a/archivebox/plugins/apt/on_Binary__13_apt_install.py +++ b/archivebox/plugins/apt/on_Binary__13_apt_install.py @@ 
-10,7 +10,7 @@ import json import sys import rich_click as click -from abx_pkg import Binary, AptProvider +from abx_pkg import Binary, AptProvider, BinProviderOverrides # Fix pydantic forward reference issue AptProvider.model_rebuild() diff --git a/archivebox/plugins/apt/tests/__init__.py b/archivebox/plugins/apt/tests/__init__.py deleted file mode 100644 index fdde694e..00000000 --- a/archivebox/plugins/apt/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the apt binary provider plugin.""" diff --git a/archivebox/plugins/apt/tests/test_apt_provider.py b/archivebox/plugins/apt/tests/test_apt_provider.py index be55e901..430fde24 100644 --- a/archivebox/plugins/apt/tests/test_apt_provider.py +++ b/archivebox/plugins/apt/tests/test_apt_provider.py @@ -21,7 +21,7 @@ from django.test import TestCase # Get the path to the apt provider hook PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_apt_provider.py' +INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None) def apt_available() -> bool: @@ -48,7 +48,7 @@ class TestAptProviderHook(TestCase): def test_hook_script_exists(self): """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") + self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") def test_hook_skips_when_apt_not_allowed(self): """Hook should skip when apt not in allowed binproviders.""" diff --git a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py index 5490008d..36522417 100644 --- a/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py +++ b/archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py @@ -47,6 +47,9 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ + def log(message: str) -> None: + print(f'[archivedotorg] 
{message}', file=sys.stderr) + try: import requests except ImportError: @@ -56,6 +59,8 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') submit_url = f'https://web.archive.org/save/{url}' + log(f'Submitting to Wayback Machine (timeout={timeout}s)') + log(f'GET {submit_url}') try: response = requests.get( @@ -64,31 +69,40 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: headers={'User-Agent': user_agent}, allow_redirects=True, ) + log(f'HTTP {response.status_code} final_url={response.url}') # Check for successful archive content_location = response.headers.get('Content-Location', '') x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '') + if content_location: + log(f'Content-Location: {content_location}') + if x_archive_orig_url: + log(f'X-Archive-Orig-Url: {x_archive_orig_url}') # Build archive URL if content_location: archive_url = f'https://web.archive.org{content_location}' Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8') + log(f'Saved archive URL -> {archive_url}') return True, OUTPUT_FILE, '' elif 'web.archive.org' in response.url: # We were redirected to an archive page Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8') + log(f'Redirected to archive page -> {response.url}') return True, OUTPUT_FILE, '' else: # Check for errors in response if 'RobotAccessControlException' in response.text: # Blocked by robots.txt - save submit URL for manual retry Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8') + log('Blocked by robots.txt, saved submit URL for manual retry') return True, OUTPUT_FILE, '' # Consider this a soft success elif response.status_code >= 400: return False, None, f'HTTP {response.status_code}' else: # Save submit URL anyway Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8') + log('No archive URL returned, saved submit URL for manual retry') return True, OUTPUT_FILE, '' 
except requests.Timeout: diff --git a/archivebox/plugins/archivedotorg/templates/icon.html b/archivebox/plugins/archivedotorg/templates/icon.html index 09f24b76..e3f48634 100644 --- a/archivebox/plugins/archivedotorg/templates/icon.html +++ b/archivebox/plugins/archivedotorg/templates/icon.html @@ -1 +1 @@ -🏛️ \ No newline at end of file + diff --git a/archivebox/plugins/chrome/binaries.jsonl b/archivebox/plugins/chrome/binaries.jsonl deleted file mode 100644 index 55ccbad0..00000000 --- a/archivebox/plugins/chrome/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "chrome", "binproviders": "npm,env,brew,apt", "overrides": {"npm": {"packages": ["@puppeteer/browsers"]}}} diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index f61cfcdd..6369f1e7 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -1253,7 +1253,7 @@ function getExtensionTargets(browser) { } /** - * Find Chromium/Chrome binary path. + * Find Chromium binary path. * Checks CHROME_BINARY env var first, then falls back to system locations. * * @returns {string|null} - Absolute path to browser binary or null if not found @@ -1276,7 +1276,9 @@ function findChromium() { const chromeBinary = getEnv('CHROME_BINARY'); if (chromeBinary) { const absPath = path.resolve(chromeBinary); - if (validateBinary(absPath)) { + if (absPath.includes('Google Chrome') || absPath.includes('google-chrome')) { + console.error('[!] Warning: CHROME_BINARY points to Chrome. Chromium is required for extension support.'); + } else if (validateBinary(absPath)) { return absPath; } console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`); @@ -1309,7 +1311,7 @@ function findChromium() { return null; }; - // 3. Search fallback locations (Chromium first, then Chrome) + // 3. 
Search fallback locations (Chromium only) const fallbackLocations = [ // System Chromium '/Applications/Chromium.app/Contents/MacOS/Chromium', @@ -1318,10 +1320,6 @@ function findChromium() { // Puppeteer cache path.join(process.env.HOME || '', '.cache/puppeteer/chromium'), path.join(process.env.HOME || '', '.cache/puppeteer'), - // Chrome (fallback - extensions may not work in 137+) - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', ]; for (const loc of fallbackLocations) { @@ -1332,9 +1330,6 @@ function findChromium() { return binary; } } else if (validateBinary(loc)) { - if (loc.includes('Google Chrome') || loc.includes('google-chrome')) { - console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+'); - } return loc; } } @@ -1699,10 +1694,10 @@ module.exports = { // Chrome launching launchChromium, killChrome, - // Chrome/Chromium install + // Chromium install installChromium, installPuppeteerCore, - // Chrome/Chromium binary finding + // Chromium binary finding findChromium, // Extension utilities getExtensionId, @@ -1744,7 +1739,7 @@ if (require.main === module) { console.log('Usage: chrome_utils.js [args...]'); console.log(''); console.log('Commands:'); - console.log(' findChromium Find Chrome/Chromium binary'); + console.log(' findChromium Find Chromium binary'); console.log(' installChromium Install Chromium via @puppeteer/browsers'); console.log(' installPuppeteerCore Install puppeteer-core npm package'); console.log(' launchChromium Launch Chrome with CDP debugging'); diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 79d1946d..f4d6a4d8 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -7,13 +7,13 @@ "type": "boolean", "default": true, "x-aliases": ["USE_CHROME"], - "description": "Enable Chrome/Chromium browser integration for 
archiving" + "description": "Enable Chromium browser integration for archiving" }, "CHROME_BINARY": { "type": "string", "default": "chromium", "x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"], - "description": "Path to Chrome/Chromium binary" + "description": "Path to Chromium binary" }, "CHROME_NODE_BINARY": { "type": "string", diff --git a/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py deleted file mode 100755 index 6730333f..00000000 --- a/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py +++ /dev/null @@ -1,265 +0,0 @@ -#!/usr/bin/env python3 -""" -Install hook for Chrome/Chromium and puppeteer-core. - -Runs at crawl start to install/find Chromium and puppeteer-core. -Also validates config and computes derived values. - -Outputs: - - JSONL for Binary and Machine config updates - - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env - -Respects CHROME_BINARY env var for custom binary paths. -Uses `npx @puppeteer/browsers install chromium@latest` and parses output. - -NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for ---load-extension and --disable-extensions-except flags, which are needed for -loading unpacked extensions in headless mode. 
-""" - -import os -import sys -import json -import subprocess -from pathlib import Path - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def detect_docker() -> bool: - """Detect if running inside Docker container.""" - return ( - os.path.exists('/.dockerenv') or - os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or - os.path.exists('/run/.containerenv') - ) - - -def get_chrome_version(binary_path: str) -> str | None: - """Get Chrome/Chromium version string.""" - try: - result = subprocess.run( - [binary_path, '--version'], - capture_output=True, - text=True, - timeout=5 - ) - if result.returncode == 0: - return result.stdout.strip() - except Exception: - pass - return None - - -def install_puppeteer_core() -> bool: - """Install puppeteer-core to NODE_MODULES_DIR if not present.""" - node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip() - if not node_modules_dir: - # No isolated node_modules, skip (will use global) - return True - - node_modules_path = Path(node_modules_dir) - if (node_modules_path / 'puppeteer-core').exists(): - return True - - # Get npm prefix from NODE_MODULES_DIR (parent of node_modules) - npm_prefix = node_modules_path.parent - - try: - print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr) - result = subprocess.run( - ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'], - capture_output=True, - text=True, - timeout=60 - ) - if result.returncode == 0: - print(f"[+] puppeteer-core installed", file=sys.stderr) - return True - else: - print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr) - return False - except Exception as e: - print(f"[!] 
Failed to install puppeteer-core: {e}", file=sys.stderr) - return False - - -def install_chromium() -> dict | None: - """Install Chromium using @puppeteer/browsers and parse output for binary path. - - Output format: "chromium@ " - e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium" - - Note: npx is fast when chromium is already cached - it returns the path without re-downloading. - """ - try: - print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr) - - # Use --path to install to puppeteer's standard cache location - cache_path = os.path.expanduser('~/.cache/puppeteer') - - result = subprocess.run( - ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'], - capture_output=True, - text=True, - stdin=subprocess.DEVNULL, - timeout=300 - ) - - if result.returncode != 0: - print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr) - return None - - # Parse output: "chromium@1563294 /path/to/Chromium" - output = result.stdout.strip() - parts = output.split(' ', 1) - if len(parts) != 2: - print(f"[!] Failed to parse install output: {output}", file=sys.stderr) - return None - - version_str = parts[0] # "chromium@1563294" - binary_path = parts[1].strip() - - if not binary_path or not os.path.exists(binary_path): - print(f"[!] Binary not found at: {binary_path}", file=sys.stderr) - return None - - # Extract version number - version = version_str.split('@')[1] if '@' in version_str else None - - print(f"[+] Chromium installed: {binary_path}", file=sys.stderr) - - return { - 'name': 'chromium', - 'abspath': binary_path, - 'version': version, - 'binprovider': 'puppeteer', - } - - except subprocess.TimeoutExpired: - print("[!] Chromium install timed out", file=sys.stderr) - except FileNotFoundError: - print("[!] npx not found - is Node.js installed?", file=sys.stderr) - except Exception as e: - print(f"[!] 
Failed to install Chromium: {e}", file=sys.stderr) - - return None - - -def main(): - warnings = [] - errors = [] - computed = {} - - # Install puppeteer-core if NODE_MODULES_DIR is set - install_puppeteer_core() - - # Check if Chrome is enabled - chrome_enabled = get_env_bool('CHROME_ENABLED', True) - - # Detect Docker and adjust sandbox - in_docker = detect_docker() - computed['IN_DOCKER'] = str(in_docker).lower() - - chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) - if in_docker and chrome_sandbox: - warnings.append( - "Running in Docker with CHROME_SANDBOX=true. " - "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." - ) - # Auto-disable sandbox in Docker unless explicitly set - if not get_env('CHROME_SANDBOX'): - computed['CHROME_SANDBOX'] = 'false' - - # Check Node.js availability - node_binary = get_env('NODE_BINARY', 'node') - computed['NODE_BINARY'] = node_binary - - # Check if CHROME_BINARY is already set and valid - configured_binary = get_env('CHROME_BINARY', '') - if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): - version = get_chrome_version(configured_binary) - computed['CHROME_BINARY'] = configured_binary - computed['CHROME_VERSION'] = version or 'unknown' - - print(json.dumps({ - 'type': 'Binary', - 'name': 'chromium', - 'abspath': configured_binary, - 'version': version, - 'binprovider': 'env', - })) - - # Output computed values - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - sys.exit(0) - - # Install/find Chromium via puppeteer - result = install_chromium() - - if result and result.get('abspath'): - computed['CHROME_BINARY'] = result['abspath'] - computed['CHROME_VERSION'] = result['version'] or 'unknown' - - print(json.dumps({ - 'type': 'Binary', - 'name': result['name'], - 'abspath': result['abspath'], - 'version': result['version'], - 'binprovider': 
result['binprovider'], - })) - - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/CHROME_BINARY', - 'value': result['abspath'], - })) - - if result['version']: - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/CHROMIUM_VERSION', - 'value': result['version'], - })) - - # Output computed values - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - sys.exit(0) - else: - errors.append("Chromium binary not found") - computed['CHROME_BINARY'] = '' - - # Output computed values and errors - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py new file mode 100755 index 00000000..af0b8ec7 --- /dev/null +++ b/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 +""" +Emit Chromium Binary dependency for the crawl. + +NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for +--load-extension and --disable-extensions-except flags, which are needed for +loading unpacked extensions in headless mode. 
+""" + +import json +import os +import sys + + +def main(): + # Check if Chrome is enabled + chrome_enabled = os.environ.get('CHROME_ENABLED', 'true').lower() not in ('false', '0', 'no', 'off') + if not chrome_enabled: + sys.exit(0) + + record = { + 'type': 'Binary', + 'name': 'chromium', + 'binproviders': 'puppeteer,env', + 'overrides': { + 'puppeteer': ['chromium@latest', '--install-deps'], + }, + } + print(json.dumps(record)) + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js similarity index 98% rename from archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js rename to archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js index f4d659e1..c50eb847 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js @@ -3,12 +3,12 @@ * Launch a shared Chromium browser session for the entire crawl. * * This runs once per crawl and keeps Chromium alive for all snapshots to share. - * Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js. + * Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js. * * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for * --load-extension and --disable-extensions-except flags. 
* - * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= + * Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id= --source-url= * Output: Writes to current directory (executor creates chrome/ dir): * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chromium process ID (for cleanup) @@ -31,7 +31,7 @@ if (process.env.NODE_MODULES_DIR) { const fs = require('fs'); const path = require('path'); -const puppeteer = require('puppeteer-core'); +const puppeteer = require('puppeteer'); const { findChromium, launchChromium, diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js similarity index 86% rename from archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js rename to archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js index db953ef0..fca4acdc 100755 --- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js @@ -2,11 +2,11 @@ /** * Create a Chrome tab for this snapshot in the shared crawl Chrome session. * - * If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js), + * If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js), * this connects to it and creates a new tab. Otherwise, falls back to launching * its own Chrome instance. 
* - * Usage: on_Snapshot__20_chrome_tab.bg.js --url= --snapshot-id= --crawl-id= + * Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= --crawl-id= * Output: Creates chrome/ directory under snapshot output dir with: * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chrome process ID (from crawl) @@ -15,11 +15,14 @@ * * Environment variables: * CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session) - * CHROME_BINARY: Path to Chrome/Chromium binary (for fallback) + * CHROME_BINARY: Path to Chromium binary (for fallback) * CHROME_RESOLUTION: Page resolution (default: 1440,2000) * CHROME_USER_AGENT: User agent string (optional) * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) * CHROME_HEADLESS: Run in headless mode (default: true) + * + * This is a background hook that stays alive until SIGTERM so the tab + * can be closed cleanly at the end of the snapshot run. */ const fs = require('fs'); @@ -28,7 +31,7 @@ const { spawn } = require('child_process'); // Add NODE_MODULES_DIR to module resolution paths if set if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); +const puppeteer = require('puppeteer'); const { findChromium, getEnv, @@ -43,6 +46,11 @@ const PLUGIN_NAME = 'chrome_tab'; const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory const CHROME_SESSION_DIR = '.'; +let finalStatus = 'failed'; +let finalOutput = ''; +let finalError = ''; +let cmdVersion = ''; +let finalized = false; // Parse command line arguments function parseArgs() { @@ -56,8 +64,31 @@ function parseArgs() { return args; } +function emitResult(statusOverride) { + if (finalized) return; + finalized = true; + + const status = statusOverride || finalStatus; + const outputStr = status === 'succeeded' + ? 
finalOutput + : (finalError || finalOutput || ''); + + const result = { + type: 'ArchiveResult', + status, + output_str: outputStr, + }; + if (cmdVersion) { + result.cmd_version = cmdVersion; + } + console.log(JSON.stringify(result)); +} + // Cleanup handler for SIGTERM - close this snapshot's tab -async function cleanup() { +async function cleanup(signal) { + if (signal) { + console.error(`\nReceived ${signal}, closing chrome tab...`); + } try { const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt'); const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt'); @@ -78,12 +109,13 @@ async function cleanup() { } catch (e) { // Best effort } - process.exit(0); + emitResult(); + process.exit(finalStatus === 'succeeded' ? 0 : 1); } // Register signal handlers -process.on('SIGTERM', cleanup); -process.on('SIGINT', cleanup); +process.on('SIGTERM', () => cleanup('SIGTERM')); +process.on('SIGINT', () => cleanup('SIGINT')); // Try to find the crawl's Chrome session function findCrawlChromeSession(crawlId) { @@ -272,23 +304,22 @@ async function main() { const crawlId = args.crawl_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__20_chrome_tab.bg.js --url= --snapshot-id= [--crawl-id=]'); + console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= [--crawl-id=]'); process.exit(1); } - const startTs = new Date(); let status = 'failed'; - let output = null; + let output = ''; let error = ''; let version = ''; try { const binary = findChromium(); if (!binary) { - console.error('ERROR: Chrome/Chromium binary not found'); - console.error('DEPENDENCY_NEEDED=chrome'); + console.error('ERROR: Chromium binary not found'); + console.error('DEPENDENCY_NEEDED=chromium'); console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); - console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable'); + console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); process.exit(1); } @@ -327,24 +358,22 @@ async function main() 
{ status = 'failed'; } - const endTs = new Date(); - if (error) { console.error(`ERROR: ${error}`); } - // Output clean JSONL (no RESULT_JSON= prefix) - const result = { - type: 'ArchiveResult', - status, - output_str: output || error || '', - }; - if (version) { - result.cmd_version = version; - } - console.log(JSON.stringify(result)); + finalStatus = status; + finalOutput = output || ''; + finalError = error || ''; + cmdVersion = version || ''; - process.exit(status === 'succeeded' ? 0 : 1); + if (status !== 'succeeded') { + emitResult(status); + process.exit(1); + } + + console.log('[*] Chrome tab created, waiting for cleanup signal...'); + await new Promise(() => {}); // Keep alive until SIGTERM } main().catch(e => { diff --git a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js new file mode 100644 index 00000000..219b58b9 --- /dev/null +++ b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +/** + * Wait for Chrome session files to exist (cdp_url.txt + target_id.txt). + * + * This is a foreground hook that blocks until the Chrome tab is ready, + * so downstream hooks can safely connect to CDP. 
+ * + * Usage: on_Snapshot__11_chrome_wait.js --url= --snapshot-id= + */ + +const fs = require('fs'); +const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); + +const { + getEnvInt, + waitForChromeSession, + readCdpUrl, + readTargetId, +} = require('./chrome_utils.js'); + +const CHROME_SESSION_DIR = '.'; + +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach(arg => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +async function main() { + const args = parseArgs(); + const url = args.url; + const snapshotId = args.snapshot_id; + + if (!url || !snapshotId) { + console.error('Usage: on_Snapshot__11_chrome_wait.js --url= --snapshot-id='); + process.exit(1); + } + + const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60))); + const timeoutMs = timeoutSeconds * 1000; + + console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`); + + const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs); + if (!ready) { + const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`; + console.error(`[chrome_wait] ERROR: ${error}`); + console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); + process.exit(1); + } + + const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); + const targetId = readTargetId(CHROME_SESSION_DIR); + if (!cdpUrl || !targetId) { + const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)'; + console.error(`[chrome_wait] ERROR: ${error}`); + console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); + process.exit(1); + } + + console.error(`[chrome_wait] Chrome session 
ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`); + console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' })); + process.exit(0); +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js index 5e2c95d6..242c9853 100644 --- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js @@ -19,7 +19,7 @@ const fs = require('fs'); const path = require('path'); // Add NODE_MODULES_DIR to module resolution paths if set if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); +const puppeteer = require('puppeteer'); const PLUGIN_NAME = 'chrome_navigate'; const CHROME_SESSION_DIR = '.'; diff --git a/archivebox/plugins/chrome/templates/icon.html b/archivebox/plugins/chrome/templates/icon.html index e69de29b..18555344 100644 --- a/archivebox/plugins/chrome/templates/icon.html +++ b/archivebox/plugins/chrome/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/chrome/tests/__init__.py b/archivebox/plugins/chrome/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 3c2424ca..8be2bb3c 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -60,6 +60,7 @@ import os import platform import signal import subprocess +import sys import time from datetime import datetime from pathlib import Path @@ -72,11 +73,14 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent # Hook script locations -CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 
'on_Crawl__01_chrome_install.py' -CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js' -CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js' +CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py' +CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js' +CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js' CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None) CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js' +PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py' +PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py' +NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py' # ============================================================================= @@ -402,7 +406,7 @@ def run_hook( # Determine interpreter based on file extension if hook_script.suffix == '.py': - cmd = ['python', str(hook_script)] + cmd = [sys.executable, str(hook_script)] elif hook_script.suffix == '.js': cmd = ['node', str(hook_script)] else: @@ -451,6 +455,128 @@ def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optio return None +def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]: + """Parse all JSONL records from stdout.""" + records: List[Dict[str, Any]] = [] + for line in stdout.strip().split('\n'): + line = line.strip() + if not line.startswith('{'): + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + continue + return records + + +def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None: + """Apply Machine update records to env dict in-place.""" + for record in records: + if record.get('type') != 'Machine': + continue + config = record.get('config') + if not isinstance(config, dict): + continue + env.update(config) + + +def install_chromium_with_hooks(env: dict, timeout: int 
= 300) -> str: + """Install Chromium via chrome crawl hook + puppeteer/npm hooks. + + Returns absolute path to Chromium binary. + """ + puppeteer_result = subprocess.run( + [sys.executable, str(PUPPETEER_CRAWL_HOOK)], + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + if puppeteer_result.returncode != 0: + raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}") + + puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {} + if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer': + raise RuntimeError("Puppeteer Binary record not emitted by crawl hook") + + npm_cmd = [ + sys.executable, + str(NPM_BINARY_HOOK), + '--machine-id=test-machine', + '--binary-id=test-puppeteer', + '--name=puppeteer', + f"--binproviders={puppeteer_record.get('binproviders', '*')}", + ] + puppeteer_overrides = puppeteer_record.get('overrides') + if puppeteer_overrides: + npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}') + + npm_result = subprocess.run( + npm_cmd, + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + if npm_result.returncode != 0: + raise RuntimeError(f"Npm install failed: {npm_result.stderr}") + + apply_machine_updates(parse_jsonl_records(npm_result.stdout), env) + + chrome_result = subprocess.run( + [sys.executable, str(CHROME_INSTALL_HOOK)], + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + if chrome_result.returncode != 0: + raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}") + + chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {} + if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'): + raise RuntimeError("Chrome Binary record not emitted by crawl hook") + + chromium_cmd = [ + sys.executable, + str(PUPPETEER_BINARY_HOOK), + '--machine-id=test-machine', + '--binary-id=test-chromium', + f"--name={chrome_record.get('name', 'chromium')}", + 
f"--binproviders={chrome_record.get('binproviders', '*')}", + ] + chrome_overrides = chrome_record.get('overrides') + if chrome_overrides: + chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}') + + result = subprocess.run( + chromium_cmd, + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + if result.returncode != 0: + raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}") + + records = parse_jsonl_records(result.stdout) + chromium_record = None + for record in records: + if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'): + chromium_record = record + break + if not chromium_record: + chromium_record = parse_jsonl_output(result.stdout, record_type='Binary') + + chromium_path = chromium_record.get('abspath') + if not chromium_path or not Path(chromium_path).exists(): + raise RuntimeError(f"Chromium binary not found after install: {chromium_path}") + + env['CHROME_BINARY'] = chromium_path + apply_machine_updates(records, env) + return chromium_path + + def run_hook_and_parse( hook_script: Path, url: str, @@ -499,7 +625,7 @@ def setup_test_env(tmpdir: Path) -> dict: crawls/ snapshots/ - Calls chrome install hook which handles puppeteer-core and chromium installation. + Calls chrome install hook + puppeteer/npm hooks for Chromium installation. Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. 
Args: @@ -559,31 +685,10 @@ def setup_test_env(tmpdir: Path) -> dict: if 'CHROME_HEADLESS' not in os.environ: env['CHROME_HEADLESS'] = 'true' - # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) - result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=120, env=env - ) - if result.returncode != 0: - pytest.skip(f"Chrome install hook failed: {result.stderr}") - - # Parse JSONL output to get CHROME_BINARY - chrome_binary = None - for line in result.stdout.strip().split('\n'): - if not line.strip(): - continue - try: - data = json.loads(line) - if data.get('type') == 'Binary' and data.get('abspath'): - chrome_binary = data['abspath'] - break - except json.JSONDecodeError: - continue - - if not chrome_binary or not Path(chrome_binary).exists(): - pytest.skip(f"Chromium binary not found: {chrome_binary}") - - env['CHROME_BINARY'] = chrome_binary + try: + install_chromium_with_hooks(env) + except RuntimeError as e: + pytest.skip(str(e)) return env @@ -790,17 +895,8 @@ def chrome_session( 'CHROME_HEADLESS': 'true', }) - # CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium) - # chrome_launch assumes chrome_install has already run - install_result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, - text=True, - timeout=120, - env=env - ) - if install_result.returncode != 0: - raise RuntimeError(f"Chrome install failed: {install_result.stderr}") + # Install Chromium via npm + puppeteer hooks using normal Binary flow + install_chromium_with_hooks(env) # Launch Chrome at crawl level chrome_launch_process = subprocess.Popen( diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 82672566..c23a48d9 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -30,9 +30,8 @@ import platform from 
archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_lib_dir, - get_node_modules_dir, find_chromium_binary, + install_chromium_with_hooks, CHROME_PLUGIN_DIR as PLUGIN_DIR, CHROME_LAUNCH_HOOK, CHROME_TAB_HOOK, @@ -41,58 +40,24 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( @pytest.fixture(scope="session", autouse=True) def ensure_chromium_and_puppeteer_installed(tmp_path_factory): - """Ensure Chromium and puppeteer are installed before running tests. - - Puppeteer handles Chromium installation automatically in its own cache. - We only need to install puppeteer itself to LIB_DIR/npm. - """ - from abx_pkg import Binary, NpmProvider, BinProviderOverrides - - # Set DATA_DIR if not already set (required by abx_pkg) + """Ensure Chromium and puppeteer are installed before running tests.""" if not os.environ.get('DATA_DIR'): - # Use isolated temp dir for direct pytest runs test_data_dir = tmp_path_factory.mktemp('chrome_test_data') os.environ['DATA_DIR'] = str(test_data_dir) + env = get_test_env() - # Compute paths AFTER setting DATA_DIR - lib_dir = get_lib_dir() - node_modules_dir = get_node_modules_dir() - npm_prefix = lib_dir / 'npm' + try: + chromium_binary = install_chromium_with_hooks(env) + except RuntimeError as e: + pytest.skip(str(e)) - # Rebuild pydantic models - NpmProvider.model_rebuild() - - # Install puppeteer if not available (it will handle Chromium in its own cache) - puppeteer_core_path = node_modules_dir / 'puppeteer-core' - if not puppeteer_core_path.exists(): - print(f"\n[*] Installing puppeteer to {npm_prefix}...") - npm_prefix.mkdir(parents=True, exist_ok=True) - - provider = NpmProvider(npm_prefix=npm_prefix) - try: - binary = Binary( - name='puppeteer', - binproviders=[provider], - overrides={'npm': {'packages': ['puppeteer@^23.5.0']}} - ) - binary.install() - print(f"[*] Puppeteer installed successfully to {npm_prefix}") - except Exception as e: - pytest.skip(f"Failed to install puppeteer: {e}") - 
- # Find Chromium binary (puppeteer installs it automatically in its cache) - chromium_binary = find_chromium_binary() if not chromium_binary: - pytest.skip("Chromium not found - puppeteer should install it automatically") + pytest.skip("Chromium not found after install") - # Set CHROME_BINARY env var for tests os.environ['CHROME_BINARY'] = chromium_binary - - -# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__) -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = get_node_modules_dir() -NPM_PREFIX = LIB_DIR / 'npm' + for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'): + if env.get(key): + os.environ[key] = env[key] def test_hook_scripts_exist(): diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js index c312f0c5..92351c05 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js @@ -32,6 +32,13 @@ const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; const CHROME_SESSION_DIR = '../chrome'; +let browser = null; +let page = null; +let logCount = 0; +let errorCount = 0; +let requestFailCount = 0; +let shuttingDown = false; + async function serializeArgs(args) { const serialized = []; for (const arg of args) { @@ -73,6 +80,7 @@ async function setupListeners() { location: msg.location(), }; fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + logCount += 1; } catch (e) { // Ignore errors } @@ -87,6 +95,7 @@ async function setupListeners() { stack: error.stack || '', }; fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + errorCount += 1; } catch (e) { // Ignore } @@ -103,6 +112,7 @@ async function setupListeners() { url: request.url(), }; fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); + requestFailCount += 1; } catch (e) { // Ignore } @@ -111,6 +121,29 @@ async function setupListeners() { return { browser, page }; } 
+function emitResult(status = 'succeeded') { + if (shuttingDown) return; + shuttingDown = true; + + const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`; + console.log(JSON.stringify({ + type: 'ArchiveResult', + status, + output_str: `${OUTPUT_FILE} (${counts})`, + })); +} + +async function handleShutdown(signal) { + console.error(`\nReceived ${signal}, emitting final results...`); + emitResult('succeeded'); + if (browser) { + try { + browser.disconnect(); + } catch (e) {} + } + process.exit(0); +} + async function main() { const args = parseArgs(); const url = args.url; @@ -127,23 +160,27 @@ async function main() { process.exit(0); } - const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000; - try { // Set up listeners BEFORE navigation - await setupListeners(); + const connection = await setupListeners(); + browser = connection.browser; + page = connection.page; - // Wait for chrome_navigate to complete (BLOCKING) - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + // Register signal handlers for graceful shutdown + process.on('SIGTERM', () => handleShutdown('SIGTERM')); + process.on('SIGINT', () => handleShutdown('SIGINT')); - // Output clean JSONL - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: OUTPUT_FILE, - })); + // Wait for chrome_navigate to complete (non-fatal) + try { + const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + } catch (e) { + console.error(`WARN: ${e.message}`); + } - process.exit(0); + // console.error('Consolelog active, waiting for cleanup signal...'); + await new Promise(() => {}); // Keep alive until SIGTERM + return; } catch (e) { const error = `${e.name}: ${e.message}`; diff --git a/archivebox/plugins/consolelog/templates/icon.html b/archivebox/plugins/consolelog/templates/icon.html index e69de29b..c68b8db5 100644 --- 
a/archivebox/plugins/consolelog/templates/icon.html +++ b/archivebox/plugins/consolelog/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/consolelog/tests/__init__.py b/archivebox/plugins/consolelog/tests/__init__.py deleted file mode 100644 index 456c345d..00000000 --- a/archivebox/plugins/consolelog/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the consolelog plugin.""" diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py index 2f9189ff..7d590aaa 100644 --- a/archivebox/plugins/consolelog/tests/test_consolelog.py +++ b/archivebox/plugins/consolelog/tests/test_consolelog.py @@ -10,6 +10,7 @@ import shutil import subprocess import sys import tempfile +import time from pathlib import Path import pytest @@ -76,26 +77,33 @@ class TestConsolelogWithChrome(TestCase): # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - # Run consolelog hook with the active Chrome session - result = subprocess.run( + # Run consolelog hook with the active Chrome session (background hook) + result = subprocess.Popen( ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], cwd=str(snapshot_chrome_dir), - capture_output=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=120, # Longer timeout as it waits for navigation env=env ) # Check for output file console_output = snapshot_chrome_dir / 'console.jsonl' - # Verify hook ran (may succeed or timeout waiting for navigation) - # The hook is designed to wait for page_loaded.txt from chrome_navigate - # In test mode, that file may not exist, so hook may timeout - # But it should still create the console.jsonl file + # Allow it to run briefly, then terminate (background hook) + time.sleep(3) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() + stdout, stderr = 
result.communicate() + else: + stdout, stderr = result.communicate() # At minimum, verify no crash - self.assertNotIn('Traceback', result.stderr) + self.assertNotIn('Traceback', stderr) # If output file exists, verify it's valid JSONL if console_output.exists(): diff --git a/archivebox/plugins/custom/on_Binary__14_custom_install.py b/archivebox/plugins/custom/on_Binary__14_custom_install.py index b0ed6c15..7e523d54 100644 --- a/archivebox/plugins/custom/on_Binary__14_custom_install.py +++ b/archivebox/plugins/custom/on_Binary__14_custom_install.py @@ -59,9 +59,16 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c provider = EnvProvider() try: binary = Binary(name=name, binproviders=[provider]).load() - except Exception as e: - click.echo(f"{name} not found after custom install: {e}", err=True) - sys.exit(1) + except Exception: + try: + binary = Binary( + name=name, + binproviders=[provider], + overrides={'env': {'version': '0.0.1'}}, + ).load() + except Exception as e: + click.echo(f"{name} not found after custom install: {e}", err=True) + sys.exit(1) if not binary.abspath: click.echo(f"{name} not found after custom install", err=True) diff --git a/archivebox/plugins/custom/tests/__init__.py b/archivebox/plugins/custom/tests/__init__.py deleted file mode 100644 index 63791d76..00000000 --- a/archivebox/plugins/custom/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the custom binary provider plugin.""" diff --git a/archivebox/plugins/custom/tests/test_custom_provider.py b/archivebox/plugins/custom/tests/test_custom_provider.py index 301f8657..22a2cb1d 100644 --- a/archivebox/plugins/custom/tests/test_custom_provider.py +++ b/archivebox/plugins/custom/tests/test_custom_provider.py @@ -17,7 +17,7 @@ from django.test import TestCase # Get the path to the custom provider hook PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_custom_bash.py' +INSTALL_HOOK = 
next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None) class TestCustomProviderHook(TestCase): @@ -34,7 +34,7 @@ class TestCustomProviderHook(TestCase): def test_hook_script_exists(self): """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") + self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") def test_hook_skips_when_custom_not_allowed(self): """Hook should skip when custom not in allowed binproviders.""" diff --git a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js b/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js index 721674f1..105f13d8 100755 --- a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js +++ b/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js @@ -32,6 +32,11 @@ const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'dns.jsonl'; const CHROME_SESSION_DIR = '../chrome'; +let browser = null; +let page = null; +let recordCount = 0; +let shuttingDown = false; + function extractHostname(url) { try { const urlObj = new URL(url); @@ -121,6 +126,7 @@ async function setupListener(targetUrl) { // Append to output file fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n'); + recordCount += 1; } catch (e) { // Ignore errors @@ -170,6 +176,7 @@ async function setupListener(targetUrl) { }; fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n'); + recordCount += 1; } } catch (e) { // Ignore errors @@ -179,6 +186,28 @@ async function setupListener(targetUrl) { return { browser, page, client }; } +function emitResult(status = 'succeeded') { + if (shuttingDown) return; + shuttingDown = true; + + console.log(JSON.stringify({ + type: 'ArchiveResult', + status, + output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`, + })); +} + +async function handleShutdown(signal) { + console.error(`\nReceived ${signal}, emitting final results...`); + emitResult('succeeded'); + if (browser) { + try { + browser.disconnect(); + } catch (e) {} + } + process.exit(0); +} + async 
function main() { const args = parseArgs(); const url = args.url; @@ -195,31 +224,27 @@ async function main() { process.exit(0); } - const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000; - try { // Set up listener BEFORE navigation - await setupListener(url); + const connection = await setupListener(url); + browser = connection.browser; + page = connection.page; - // Wait for chrome_navigate to complete (BLOCKING) - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + // Register signal handlers for graceful shutdown + process.on('SIGTERM', () => handleShutdown('SIGTERM')); + process.on('SIGINT', () => handleShutdown('SIGINT')); - // Count DNS records - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - let recordCount = 0; - if (fs.existsSync(outputPath)) { - const content = fs.readFileSync(outputPath, 'utf8'); - recordCount = content.split('\n').filter(line => line.trim()).length; + // Wait for chrome_navigate to complete (non-fatal) + try { + const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + } catch (e) { + console.error(`WARN: ${e.message}`); } - // Output clean JSONL - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`, - })); - - process.exit(0); + // console.error('DNS listener active, waiting for cleanup signal...'); + await new Promise(() => {}); // Keep alive until SIGTERM + return; } catch (e) { const error = `${e.name}: ${e.message}`; diff --git a/archivebox/plugins/dns/templates/icon.html b/archivebox/plugins/dns/templates/icon.html index e69de29b..1a558d40 100644 --- a/archivebox/plugins/dns/templates/icon.html +++ b/archivebox/plugins/dns/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js index cc35645e..f62662f8 100644 --- a/archivebox/plugins/dom/on_Snapshot__53_dom.js +++ 
b/archivebox/plugins/dom/on_Snapshot__53_dom.js @@ -52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome'; // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { - return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; + if (!fs.existsSync(STATICFILE_DIR)) return false; + const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log'); + if (!fs.existsSync(stdoutPath)) return false; + const stdout = fs.readFileSync(stdoutPath, 'utf8'); + for (const line of stdout.split('\n')) { + const trimmed = line.trim(); + if (!trimmed.startsWith('{')) continue; + try { + const record = JSON.parse(trimmed); + if (record.type === 'ArchiveResult' && record.status === 'succeeded') { + return true; + } + } catch (e) {} + } + return false; } // Wait for chrome tab to be fully loaded diff --git a/archivebox/plugins/dom/templates/icon.html b/archivebox/plugins/dom/templates/icon.html index f8995a81..56efac8d 100644 --- a/archivebox/plugins/dom/templates/icon.html +++ b/archivebox/plugins/dom/templates/icon.html @@ -1 +1 @@ -🌐 \ No newline at end of file + diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index fea41b8d..7312a72f 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -142,7 +142,7 @@ def test_staticfile_present_skips(): # dom/ <- dom extractor runs here, looks for ../staticfile staticfile_dir = tmpdir / 'staticfile' staticfile_dir.mkdir() - (staticfile_dir / 'index.html').write_text('test') + (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n') dom_dir = tmpdir / 'dom' dom_dir.mkdir() diff --git a/archivebox/plugins/env/on_Binary__15_env_install.py b/archivebox/plugins/env/on_Binary__15_env_install.py index 0e867063..35b3a9ca 100644 --- a/archivebox/plugins/env/on_Binary__15_env_install.py +++ 
b/archivebox/plugins/env/on_Binary__15_env_install.py @@ -25,7 +25,8 @@ from abx_pkg import Binary, EnvProvider @click.option('--binary-id', required=True, help="Dependency UUID") @click.option('--name', required=True, help="Binary name to find") @click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -def main(binary_id: str, machine_id: str, name: str, binproviders: str): +@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)") +def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None): """Check if binary is available in PATH and record it.""" # Check if env provider is allowed diff --git a/archivebox/plugins/env/tests/__init__.py b/archivebox/plugins/env/tests/__init__.py deleted file mode 100644 index 4fe95e6e..00000000 --- a/archivebox/plugins/env/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the env binary provider plugin.""" diff --git a/archivebox/plugins/env/tests/test_env_provider.py b/archivebox/plugins/env/tests/test_env_provider.py index bf3cc590..2bffcfca 100644 --- a/archivebox/plugins/env/tests/test_env_provider.py +++ b/archivebox/plugins/env/tests/test_env_provider.py @@ -17,7 +17,7 @@ from django.test import TestCase # Get the path to the env provider hook PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_env_provider.py' +INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None) class TestEnvProviderHook(TestCase): @@ -34,7 +34,7 @@ class TestEnvProviderHook(TestCase): def test_hook_script_exists(self): """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") + self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") def test_hook_finds_python(self): """Hook should find python3 binary in PATH.""" diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py 
b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py index ea5e9200..4b40d726 100644 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py +++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py @@ -126,7 +126,12 @@ def main(url: str, snapshot_id: str): try: # Run extraction success, output, error = get_favicon(url) - status = 'succeeded' if success else 'failed' + if success: + status = 'succeeded' + elif error == 'No favicon found': + status = 'skipped' + else: + status = 'failed' except Exception as e: error = f'{type(e).__name__}: {e}' @@ -143,7 +148,7 @@ def main(url: str, snapshot_id: str): } print(json.dumps(result)) - sys.exit(0 if status == 'succeeded' else 1) + sys.exit(0 if status in ('succeeded', 'skipped') else 1) if __name__ == '__main__': diff --git a/archivebox/plugins/favicon/templates/icon.html b/archivebox/plugins/favicon/templates/icon.html index ec6acc11..7ba648b3 100644 --- a/archivebox/plugins/favicon/templates/icon.html +++ b/archivebox/plugins/favicon/templates/icon.html @@ -1 +1 @@ -⭐ \ No newline at end of file + diff --git a/archivebox/plugins/forumdl/binaries.jsonl b/archivebox/plugins/forumdl/binaries.jsonl deleted file mode 100644 index 2d085bdd..00000000 --- a/archivebox/plugins/forumdl/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "forum-dl", "binproviders": "pip,env"} diff --git a/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py deleted file mode 100755 index f52a72f2..00000000 --- a/archivebox/plugins/forumdl/on_Crawl__13_forumdl_install.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect forum-dl binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if forum-dl is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True) - forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl') - - if not forumdl_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=forumdl_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='forum-dl') - else: - # Binary not found - output_binary_missing(name='forum-dl', binproviders='pip') - except Exception: - # Binary not found - output_binary_missing(name='forum-dl', binproviders='pip') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py 
b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py new file mode 100755 index 00000000..73a72a24 --- /dev/null +++ b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +""" +Emit forum-dl Binary dependency for the crawl. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str, overrides: dict | None = None): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + if overrides: + record['overrides'] = overrides + print(json.dumps(record)) + + +def main(): + forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True) + + if not forumdl_enabled: + sys.exit(0) + + output_binary( + name='forum-dl', + binproviders='pip,env', + overrides={ + 'pip': { + 'packages': [ + '--no-deps', + 'forum-dl', + 'pydantic', + 'pydantic-core', + 'typing-extensions', + 'annotated-types', + 'typing-inspection', + 'beautifulsoup4', + 'soupsieve', + 'lxml', + 'requests', + 'urllib3', + 'certifi', + 'idna', + 'charset-normalizer', + 'tenacity', + 'python-dateutil', + 'six', + 'html2text', + 'warcio', + ] + } + }, + ) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py similarity index 87% rename from archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py rename to archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py index 8cb97d54..9d2c2461 100755 --- 
a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py +++ b/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py @@ -2,7 +2,7 @@ """ Download forum content from a URL using forum-dl. -Usage: on_Snapshot__forumdl.py --url= --snapshot-id= +Usage: on_Snapshot__04_forumdl.bg.py --url= --snapshot-id= Output: Downloads forum content to $PWD/ Environment variables: @@ -19,6 +19,7 @@ import json import os import subprocess import sys +import threading from pathlib import Path import rich_click as click @@ -131,13 +132,41 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: cmd.append(url) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr) + output_lines: list[str] = [] + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + def _read_output() -> None: + if not process.stdout: + return + for line in process.stdout: + output_lines.append(line) + sys.stderr.write(line) + + reader = threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + reader.join(timeout=1) + return False, None, f'Timed out after {timeout} seconds' + + reader.join(timeout=1) + combined_output = ''.join(output_lines) # Check if output file was created if output_file.exists() and output_file.stat().st_size > 0: return True, str(output_file), '' else: - stderr = result.stderr + stderr = combined_output # These are NOT errors - page simply has no downloadable forum content stderr_lower = stderr.lower() @@ -147,7 +176,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: return True, None, '' # No forum found - success, no output if 'extractornotfounderror' in stderr_lower: return True, None, '' # No forum extractor for this URL - success, no output - if result.returncode == 0: 
+ if process.returncode == 0: return True, None, '' # forum-dl exited cleanly, just no forum - success # These ARE errors - something went wrong diff --git a/archivebox/plugins/forumdl/templates/icon.html b/archivebox/plugins/forumdl/templates/icon.html index 4c000f72..01cace0d 100644 --- a/archivebox/plugins/forumdl/templates/icon.html +++ b/archivebox/plugins/forumdl/templates/icon.html @@ -1 +1 @@ -💬 \ No newline at end of file + diff --git a/archivebox/plugins/gallerydl/binaries.jsonl b/archivebox/plugins/gallerydl/binaries.jsonl deleted file mode 100644 index 1fb165f1..00000000 --- a/archivebox/plugins/gallerydl/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "gallery-dl", "binproviders": "pip,brew,apt,env"} diff --git a/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py b/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py deleted file mode 100755 index df627ab4..00000000 --- a/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect gallery-dl binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if gallery-dl is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True) - gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl') - - if not gallerydl_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=gallerydl_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='gallery-dl') - else: - # Binary not found - output_binary_missing(name='gallery-dl', binproviders='pip') - except Exception: - # Binary not found - output_binary_missing(name='gallery-dl', binproviders='pip') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py 
b/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py new file mode 100755 index 00000000..06d95f4d --- /dev/null +++ b/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Emit gallery-dl Binary dependency for the crawl. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True) + + if not gallerydl_enabled: + sys.exit(0) + + output_binary(name='gallery-dl', binproviders='pip,brew,apt,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py b/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py similarity index 81% rename from archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py rename to archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py index 78c1128a..d4c2a08d 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py @@ -2,7 +2,7 @@ """ Download image galleries from a URL using gallery-dl. 
-Usage: on_Snapshot__gallerydl.py --url= --snapshot-id= +Usage: on_Snapshot__03_gallerydl.bg.py --url= --snapshot-id= Output: Downloads gallery images to $PWD/gallerydl/ Environment variables: @@ -19,6 +19,7 @@ import json import os import subprocess import sys +import threading from pathlib import Path import rich_click as click @@ -70,7 +71,22 @@ STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" staticfile_dir = Path(STATICFILE_DIR) - return staticfile_dir.exists() and any(staticfile_dir.iterdir()) + if not staticfile_dir.exists(): + return False + stdout_log = staticfile_dir / 'stdout.log' + if not stdout_log.exists(): + return False + for line in stdout_log.read_text(errors='ignore').splitlines(): + line = line.strip() + if not line.startswith('{'): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': + return True + return False def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: @@ -109,7 +125,35 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: cmd.append(url) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr) + output_lines: list[str] = [] + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + def _read_output() -> None: + if not process.stdout: + return + for line in process.stdout: + output_lines.append(line) + sys.stderr.write(line) + + reader = threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + reader.join(timeout=1) + return False, None, f'Timed out after {timeout} seconds' + + reader.join(timeout=1) + 
combined_output = ''.join(output_lines) # Check if any gallery files were downloaded (search recursively) gallery_extensions = ( @@ -132,7 +176,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: output = str(image_files[0]) if image_files else str(downloaded_files[0]) return True, output, '' else: - stderr = result.stderr + stderr = combined_output # These are NOT errors - page simply has no downloadable gallery # Return success with no output (legitimate "nothing to download") @@ -141,7 +185,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: return True, None, '' # Not a gallery site - success, no output if 'no results' in stderr_lower: return True, None, '' # No gallery found - success, no output - if result.returncode == 0: + if process.returncode == 0: return True, None, '' # gallery-dl exited cleanly, just no gallery - success # These ARE errors - something went wrong diff --git a/archivebox/plugins/gallerydl/templates/icon.html b/archivebox/plugins/gallerydl/templates/icon.html index b6bb6d16..a8ef89e7 100644 --- a/archivebox/plugins/gallerydl/templates/icon.html +++ b/archivebox/plugins/gallerydl/templates/icon.html @@ -1 +1 @@ -🖼️ \ No newline at end of file + diff --git a/archivebox/plugins/git/binaries.jsonl b/archivebox/plugins/git/binaries.jsonl deleted file mode 100644 index b459ab22..00000000 --- a/archivebox/plugins/git/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "git", "binproviders": "apt,brew,env"} diff --git a/archivebox/plugins/git/on_Crawl__05_git_install.py b/archivebox/plugins/git/on_Crawl__05_git_install.py new file mode 100755 index 00000000..e090d546 --- /dev/null +++ b/archivebox/plugins/git/on_Crawl__05_git_install.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Emit git Binary dependency for the crawl. 
+""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + git_enabled = get_env_bool('GIT_ENABLED', True) + + if not git_enabled: + sys.exit(0) + + output_binary(name='git', binproviders='apt,brew,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/git/on_Crawl__09_git_install.py b/archivebox/plugins/git/on_Crawl__09_git_install.py deleted file mode 100755 index 4179ed81..00000000 --- a/archivebox/plugins/git/on_Crawl__09_git_install.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect git binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if git is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - git_enabled = get_env_bool('GIT_ENABLED', True) - git_binary = get_env('GIT_BINARY', 'git') - - if not git_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=git_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='git') - else: - # Binary not found - output_binary_missing(name='git', binproviders='apt,brew') - except Exception: - # Binary not found - output_binary_missing(name='git', binproviders='apt,brew') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/git/on_Snapshot__62_git.py b/archivebox/plugins/git/on_Snapshot__05_git.bg.py similarity index 98% 
rename from archivebox/plugins/git/on_Snapshot__62_git.py rename to archivebox/plugins/git/on_Snapshot__05_git.bg.py index 04dbbd70..14ad7894 100644 --- a/archivebox/plugins/git/on_Snapshot__62_git.py +++ b/archivebox/plugins/git/on_Snapshot__05_git.bg.py @@ -2,7 +2,7 @@ """ Clone a git repository from a URL. -Usage: on_Snapshot__git.py --url= --snapshot-id= +Usage: on_Snapshot__05_git.bg.py --url= --snapshot-id= Output: Clones repository to $PWD/repo Environment variables: diff --git a/archivebox/plugins/git/templates/icon.html b/archivebox/plugins/git/templates/icon.html index de2a340a..e16f0231 100644 --- a/archivebox/plugins/git/templates/icon.html +++ b/archivebox/plugins/git/templates/icon.html @@ -1 +1 @@ -📂 \ No newline at end of file + diff --git a/archivebox/plugins/headers/templates/icon.html b/archivebox/plugins/headers/templates/icon.html index e74c28f8..f693e709 100644 --- a/archivebox/plugins/headers/templates/icon.html +++ b/archivebox/plugins/headers/templates/icon.html @@ -1 +1 @@ -📋 \ No newline at end of file + diff --git a/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py index c719c027..30134446 100644 --- a/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py +++ b/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py @@ -76,22 +76,28 @@ def find_html_source() -> str | None: # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories search_patterns = [ 'singlefile/singlefile.html', + '*_singlefile/singlefile.html', 'singlefile/*.html', + '*_singlefile/*.html', 'dom/output.html', + '*_dom/output.html', 'dom/*.html', + '*_dom/*.html', 'wget/**/*.html', + '*_wget/**/*.html', 'wget/**/*.htm', + '*_wget/**/*.htm', ] - cwd = Path.cwd() - for pattern in search_patterns: - matches = list(cwd.glob(pattern)) - for match in matches: - if match.is_file() and match.stat().st_size > 0: - try: - return match.read_text(errors='ignore') - except Exception: - 
continue + for base in (Path.cwd(), Path.cwd().parent): + for pattern in search_patterns: + matches = list(base.glob(pattern)) + for match in matches: + if match.is_file() and match.stat().st_size > 0: + try: + return match.read_text(errors='ignore') + except Exception: + continue return None diff --git a/archivebox/plugins/htmltotext/templates/icon.html b/archivebox/plugins/htmltotext/templates/icon.html index 070c6ec4..d1c8c78d 100644 --- a/archivebox/plugins/htmltotext/templates/icon.html +++ b/archivebox/plugins/htmltotext/templates/icon.html @@ -1 +1 @@ -📃 \ No newline at end of file + diff --git a/archivebox/plugins/infiniscroll/templates/icon.html b/archivebox/plugins/infiniscroll/templates/icon.html new file mode 100644 index 00000000..7de95bf4 --- /dev/null +++ b/archivebox/plugins/infiniscroll/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js similarity index 97% rename from archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js index f2df6629..ab29cdac 100755 --- a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js +++ b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js @@ -7,7 +7,7 @@ * * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm * - * Priority: 02 (early) - Must install before Chrome session starts at Crawl level + * Priority: 81 - Must install before Chrome session starts at Crawl level * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: diff --git a/archivebox/plugins/mercury/binaries.jsonl 
b/archivebox/plugins/mercury/binaries.jsonl deleted file mode 100644 index 9b9be5cf..00000000 --- a/archivebox/plugins/mercury/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "postlight-parser", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["@postlight/parser"]}}} diff --git a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py deleted file mode 100755 index 25d1c9c1..00000000 --- a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect postlight-parser binary and emit Binary JSONL record. - -Output: Binary JSONL record to stdout if postlight-parser is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'overrides': { - 'npm': { - 'packages': ['@postlight/parser'], - } - }, - 'machine_id': 
machine_id, - } - print(json.dumps(record)) - - -def main(): - mercury_enabled = get_env_bool('MERCURY_ENABLED', True) - mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser') - - if not mercury_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=mercury_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='postlight-parser') - else: - # Binary not found - output_binary_missing(name='postlight-parser', binproviders='npm') - except Exception: - # Binary not found - output_binary_missing(name='postlight-parser', binproviders='npm') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py b/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py new file mode 100755 index 00000000..7ec64d8b --- /dev/null +++ b/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +""" +Emit postlight-parser Binary dependency for the crawl. 
+""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'overrides': { + 'npm': { + 'packages': ['@postlight/parser'], + } + }, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + mercury_enabled = get_env_bool('MERCURY_ENABLED', True) + + if not mercury_enabled: + sys.exit(0) + + output_binary(name='postlight-parser', binproviders='npm,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/mercury/templates/icon.html b/archivebox/plugins/mercury/templates/icon.html index 776ed9b1..bd17e0cf 100644 --- a/archivebox/plugins/mercury/templates/icon.html +++ b/archivebox/plugins/mercury/templates/icon.html @@ -1 +1 @@ -☿️ \ No newline at end of file + diff --git a/archivebox/plugins/merkletree/templates/icon.html b/archivebox/plugins/merkletree/templates/icon.html index e69de29b..b8d3579c 100644 --- a/archivebox/plugins/merkletree/templates/icon.html +++ b/archivebox/plugins/merkletree/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/merkletree/tests/__init__.py b/archivebox/plugins/merkletree/tests/__init__.py deleted file mode 100644 index 1eb43866..00000000 --- a/archivebox/plugins/merkletree/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the merkletree plugin.""" diff --git a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js index 
3469026d..38b2a604 100644 --- a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js +++ b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js @@ -287,7 +287,7 @@ async function main() { page = pages[pages.length - 1]; } - console.error(`Modalcloser listening on ${url}`); + // console.error(`Modalcloser listening on ${url}`); // Set up dialog handler (for JS alert/confirm/prompt/beforeunload) page.on('dialog', async (dialog) => { diff --git a/archivebox/plugins/modalcloser/templates/icon.html b/archivebox/plugins/modalcloser/templates/icon.html new file mode 100644 index 00000000..e58b588b --- /dev/null +++ b/archivebox/plugins/modalcloser/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/npm/on_Binary__10_npm_install.py b/archivebox/plugins/npm/on_Binary__10_npm_install.py index 4bf1a05c..f0b43893 100644 --- a/archivebox/plugins/npm/on_Binary__10_npm_install.py +++ b/archivebox/plugins/npm/on_Binary__10_npm_install.py @@ -90,30 +90,34 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c } print(json.dumps(record)) - # Emit PATH update if npm bin dir not already in PATH - npm_bin_dir = str(npm_prefix / 'bin') + # Emit PATH update for npm bin dirs (node_modules/.bin preferred) + npm_bin_dirs = [ + str(npm_prefix / 'node_modules' / '.bin'), + str(npm_prefix / 'bin'), + ] current_path = os.environ.get('PATH', '') + path_dirs = current_path.split(':') if current_path else [] + new_path = current_path - # Check if npm_bin_dir is already in PATH - path_dirs = current_path.split(':') - if npm_bin_dir not in path_dirs: - # Prepend npm_bin_dir to PATH - new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/PATH', - 'value': new_path, - })) - click.echo(f" Added {npm_bin_dir} to PATH", err=True) + for npm_bin_dir in npm_bin_dirs: + if npm_bin_dir and npm_bin_dir not in path_dirs: + new_path = 
f"{npm_bin_dir}:{new_path}" if new_path else npm_bin_dir + path_dirs.insert(0, npm_bin_dir) + + print(json.dumps({ + 'type': 'Machine', + 'config': { + 'PATH': new_path, + }, + })) # Also emit NODE_MODULES_DIR for JS module resolution node_modules_dir = str(npm_prefix / 'node_modules') print(json.dumps({ 'type': 'Machine', - '_method': 'update', - 'key': 'config/NODE_MODULES_DIR', - 'value': node_modules_dir, + 'config': { + 'NODE_MODULES_DIR': node_modules_dir, + }, })) # Log human-readable info to stderr diff --git a/archivebox/plugins/npm/on_Crawl__00_npm_install.py b/archivebox/plugins/npm/on_Crawl__00_npm_install.py new file mode 100644 index 00000000..5660dd01 --- /dev/null +++ b/archivebox/plugins/npm/on_Crawl__00_npm_install.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Emit node/npm Binary dependencies for the crawl. + +This hook runs early in the Crawl lifecycle so node/npm are installed +before any npm-based extractors (e.g., puppeteer) run. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def output_binary(name: str, binproviders: str, overrides: dict | None = None) -> None: + machine_id = os.environ.get('MACHINE_ID', '') + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + if overrides: + record['overrides'] = overrides + print(json.dumps(record)) + + +def main() -> None: + output_binary( + name='node', + binproviders='apt,brew,env', + overrides={'apt': {'packages': ['nodejs']}}, + ) + + output_binary( + name='npm', + binproviders='apt,brew,env', + overrides={ + 'apt': {'packages': ['nodejs', 'npm']}, + 'brew': {'packages': ['node']}, + }, + ) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/npm/tests/__init__.py b/archivebox/plugins/npm/tests/__init__.py deleted file mode 100644 index 08ccd028..00000000 --- 
a/archivebox/plugins/npm/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the npm binary provider plugin.""" diff --git a/archivebox/plugins/npm/tests/test_npm_provider.py b/archivebox/plugins/npm/tests/test_npm_provider.py index c5099475..5492738a 100644 --- a/archivebox/plugins/npm/tests/test_npm_provider.py +++ b/archivebox/plugins/npm/tests/test_npm_provider.py @@ -22,7 +22,7 @@ from django.test import TestCase # Get the path to the npm provider hook PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_npm_provider.py' +INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_npm_install.py'), None) def npm_available() -> bool: @@ -45,7 +45,7 @@ class TestNpmProviderHook(TestCase): def test_hook_script_exists(self): """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") + self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") def test_hook_requires_lib_dir(self): """Hook should fail when LIB_DIR is not set.""" diff --git a/archivebox/plugins/papersdl/binaries.jsonl b/archivebox/plugins/papersdl/binaries.jsonl deleted file mode 100644 index 538af943..00000000 --- a/archivebox/plugins/papersdl/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "papers-dl", "binproviders": "pip,env"} diff --git a/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py b/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py deleted file mode 100755 index 8c548c7c..00000000 --- a/archivebox/plugins/papersdl/on_Crawl__14_papersdl_install.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect papers-dl binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if papers-dl is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True) - papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl') - - if not papersdl_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=papersdl_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='papers-dl') - else: - # Binary not found - output_binary_missing(name='papers-dl', binproviders='pip') - except Exception: - # Binary not found - output_binary_missing(name='papers-dl', binproviders='pip') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py 
b/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py new file mode 100755 index 00000000..050aa23b --- /dev/null +++ b/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +""" +Emit papers-dl Binary dependency for the crawl. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True) + + if not papersdl_enabled: + sys.exit(0) + + output_binary(name='papers-dl', binproviders='pip,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py index 859d911e..60015050 100755 --- a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py +++ b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py @@ -23,6 +23,7 @@ import os import re import subprocess import sys +import threading from pathlib import Path import rich_click as click @@ -108,7 +109,35 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: cmd.extend(papersdl_args_extra) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + print(f'[papersdl] Starting download (timeout={timeout}s)', file=sys.stderr) + output_lines: list[str] = [] + process = subprocess.Popen( + cmd, + 
stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + def _read_output() -> None: + if not process.stdout: + return + for line in process.stdout: + output_lines.append(line) + sys.stderr.write(line) + + reader = threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + reader.join(timeout=1) + return False, None, f'Timed out after {timeout} seconds' + + reader.join(timeout=1) + combined_output = ''.join(output_lines) # Check if any PDF files were downloaded pdf_files = list(output_dir.glob('*.pdf')) @@ -117,8 +146,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: # Return first PDF file return True, str(pdf_files[0]), '' else: - stderr = result.stderr - stdout = result.stdout + stderr = combined_output + stdout = combined_output # These are NOT errors - page simply has no downloadable paper stderr_lower = stderr.lower() @@ -127,7 +156,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: return True, None, '' # Paper not available - success, no output if 'no results' in stderr_lower or 'no results' in stdout_lower: return True, None, '' # No paper found - success, no output - if result.returncode == 0: + if process.returncode == 0: return True, None, '' # papers-dl exited cleanly, just no paper - success # These ARE errors - something went wrong diff --git a/archivebox/plugins/papersdl/templates/icon.html b/archivebox/plugins/papersdl/templates/icon.html index 063530f3..94afb781 100644 --- a/archivebox/plugins/papersdl/templates/icon.html +++ b/archivebox/plugins/papersdl/templates/icon.html @@ -1 +1 @@ -📄 \ No newline at end of file + diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js index d3eafb0b..e900d9b5 100755 --- 
a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js +++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js @@ -193,6 +193,9 @@ async function extractOutlinks(url) { type: 'Snapshot', url: href, plugin: PLUGIN_NAME, + depth: depth + 1, + parent_snapshot_id: snapshotId || undefined, + crawl_id: crawlId || undefined, })).join('\n'); if (urlsJsonl) { @@ -214,6 +217,8 @@ async function main() { const args = parseArgs(); const url = args.url; const snapshotId = args.snapshot_id; + const crawlId = args.crawl_id || process.env.CRAWL_ID; + const depth = parseInt(args.depth || process.env.SNAPSHOT_DEPTH || '0', 10) || 0; if (!url || !snapshotId) { console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url= --snapshot-id='); diff --git a/archivebox/plugins/parse_dom_outlinks/templates/icon.html b/archivebox/plugins/parse_dom_outlinks/templates/icon.html index f77458fd..b333082c 100644 --- a/archivebox/plugins/parse_dom_outlinks/templates/icon.html +++ b/archivebox/plugins/parse_dom_outlinks/templates/icon.html @@ -1 +1 @@ -🔗 + diff --git a/archivebox/plugins/parse_dom_outlinks/tests/__init__.py b/archivebox/plugins/parse_dom_outlinks/tests/__init__.py deleted file mode 100644 index 47e46db9..00000000 --- a/archivebox/plugins/parse_dom_outlinks/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the parse_dom_outlinks plugin.""" diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py index 33045184..cf6df8ed 100644 --- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py +++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py @@ -79,8 +79,7 @@ class TestParseDomOutlinksWithChrome(TestCase): # Run outlinks hook with the active Chrome session result = subprocess.run( ['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - 
cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, diff --git a/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py index 14fe3a6b..1fc36552 100755 --- a/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py +++ b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py @@ -24,14 +24,15 @@ from datetime import datetime, timezone from html import unescape from html.parser import HTMLParser from pathlib import Path -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, urlunparse import rich_click as click PLUGIN_NAME = 'parse_html_urls' -# Check if parse_dom_outlinks extractor already ran -DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl') +# Check if parse_dom_outlinks extractor already ran (sibling plugin output dir) +DOM_OUTLINKS_URLS_FILE = Path('..') / 'parse_dom_outlinks' / 'urls.jsonl' +URLS_FILE = Path('urls.jsonl') # URL regex from archivebox/misc/util.py @@ -95,8 +96,9 @@ def fix_urljoin_bug(url: str, nesting_limit=5) -> str: def normalize_url(url: str, root_url: str = None) -> str: """Normalize a URL, resolving relative paths if root_url provided.""" + url = clean_url_candidate(url) if not root_url: - return url + return _normalize_trailing_slash(url) url_is_absolute = url.lower().startswith('http://') or url.lower().startswith('https://') @@ -110,7 +112,40 @@ def normalize_url(url: str, root_url: str = None) -> str: if did_urljoin_misbehave(root_url, url, resolved): resolved = fix_urljoin_bug(resolved) - return resolved + return _normalize_trailing_slash(resolved) + + +def _normalize_trailing_slash(url: str) -> str: + """Drop trailing slash for non-root paths when no query/fragment.""" + try: + parsed = urlparse(url) + path = parsed.path or '' + if path != '/' and path.endswith('/') and not parsed.query and not 
parsed.fragment: + path = path.rstrip('/') + return urlunparse((parsed.scheme, parsed.netloc, path, parsed.params, parsed.query, parsed.fragment)) + except Exception: + pass + return url + + +def clean_url_candidate(url: str) -> str: + """Strip obvious surrounding/trailing punctuation from extracted URLs.""" + cleaned = (url or '').strip() + if not cleaned: + return cleaned + + # Strip common wrappers + cleaned = cleaned.strip(' \t\r\n') + cleaned = cleaned.strip('"\'<>[]()') + + # Strip trailing punctuation and escape artifacts + cleaned = cleaned.rstrip('.,;:!?)\\\'"') + cleaned = cleaned.rstrip('"') + + # Strip leading punctuation artifacts + cleaned = cleaned.lstrip('(\'"<') + + return cleaned def fetch_content(url: str) -> str: @@ -131,6 +166,43 @@ def fetch_content(url: str) -> str: return response.read().decode('utf-8', errors='replace') +def find_html_sources() -> list[str]: + """Find HTML content from other extractors in the snapshot directory.""" + search_patterns = [ + 'readability/content.html', + '*_readability/content.html', + 'mercury/content.html', + '*_mercury/content.html', + 'singlefile/singlefile.html', + '*_singlefile/singlefile.html', + 'singlefile/*.html', + '*_singlefile/*.html', + 'dom/output.html', + '*_dom/output.html', + 'dom/*.html', + '*_dom/*.html', + 'wget/**/*.html', + '*_wget/**/*.html', + 'wget/**/*.htm', + '*_wget/**/*.htm', + 'wget/**/*.htm*', + '*_wget/**/*.htm*', + ] + + sources: list[str] = [] + for base in (Path.cwd(), Path.cwd().parent): + for pattern in search_patterns: + for match in base.glob(pattern): + if not match.is_file() or match.stat().st_size == 0: + continue + try: + sources.append(match.read_text(errors='ignore')) + except Exception: + continue + + return sources + + +@click.command() +@click.option('--url', required=True, help='HTML URL to parse') +@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID') @@ -138,6 +210,13 @@ def fetch_content(url: str) -> str: @click.option('--depth', 
type=int, default=0, help='Current depth level') def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0): """Parse HTML and extract href URLs.""" + env_depth = os.environ.get('SNAPSHOT_DEPTH') + if env_depth is not None: + try: + depth = int(env_depth) + except Exception: + pass + crawl_id = crawl_id or os.environ.get('CRAWL_ID') # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage) # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback @@ -145,32 +224,38 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0 click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs') sys.exit(0) - try: - content = fetch_content(url) - except Exception as e: - click.echo(f'Failed to fetch {url}: {e}', err=True) - sys.exit(1) - - # Parse HTML for hrefs - parser = HrefParser() - try: - parser.feed(content) - except Exception as e: - click.echo(f'Failed to parse HTML: {e}', err=True) - sys.exit(1) + contents = find_html_sources() + if not contents: + try: + contents = [fetch_content(url)] + except Exception as e: + click.echo(f'Failed to fetch {url}: {e}', err=True) + sys.exit(1) urls_found = set() - for href in parser.urls: - # Normalize URL - normalized = normalize_url(href, root_url=url) + for content in contents: + # Parse HTML for hrefs + parser = HrefParser() + try: + parser.feed(content) + except Exception: + pass - # Only include http/https URLs - if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'): - # Skip the source URL itself - if normalized != url: - urls_found.add(unescape(normalized)) + for href in parser.urls: + normalized = normalize_url(href, root_url=url) + if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'): + if normalized != url: + urls_found.add(unescape(normalized)) - # Emit Snapshot records to stdout (JSONL) + # Also capture 
explicit URLs in the HTML text + for match in URL_REGEX.findall(content): + normalized = normalize_url(match, root_url=url) + if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'): + if normalized != url: + urls_found.add(unescape(normalized)) + + # Emit Snapshot records to stdout (JSONL) and urls.jsonl for crawl system + records = [] for found_url in sorted(urls_found): record = { 'type': 'Snapshot', @@ -183,8 +268,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0 if crawl_id: record['crawl_id'] = crawl_id + records.append(record) print(json.dumps(record)) + if records: + URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n') + # Emit ArchiveResult record to mark completion status = 'succeeded' if urls_found else 'skipped' output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found' diff --git a/archivebox/plugins/parse_html_urls/templates/icon.html b/archivebox/plugins/parse_html_urls/templates/icon.html index f77458fd..ee9d8294 100644 --- a/archivebox/plugins/parse_html_urls/templates/icon.html +++ b/archivebox/plugins/parse_html_urls/templates/icon.html @@ -1 +1 @@ -🔗 + diff --git a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py index 6b846f5d..086c7f10 100755 --- a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py +++ b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py @@ -132,6 +132,13 @@ def fetch_content(url: str) -> str: @click.option('--depth', type=int, default=0, help='Current depth level') def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0): """Parse JSONL bookmark file and extract URLs.""" + env_depth = os.environ.get('SNAPSHOT_DEPTH') + if env_depth is not None: + try: + depth = int(env_depth) + except Exception: + pass + crawl_id = crawl_id or os.environ.get('CRAWL_ID') try: 
content = fetch_content(url) diff --git a/archivebox/plugins/parse_jsonl_urls/templates/icon.html b/archivebox/plugins/parse_jsonl_urls/templates/icon.html index 98c76c15..124a8cb4 100644 --- a/archivebox/plugins/parse_jsonl_urls/templates/icon.html +++ b/archivebox/plugins/parse_jsonl_urls/templates/icon.html @@ -1 +1 @@ -📋 + diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py index caccdac5..99e3c8c1 100755 --- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py +++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py @@ -168,6 +168,13 @@ def fetch_content(url: str) -> str: @click.option('--depth', type=int, default=0, help='Current depth level') def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0): """Parse Netscape bookmark HTML and extract URLs.""" + env_depth = os.environ.get('SNAPSHOT_DEPTH') + if env_depth is not None: + try: + depth = int(env_depth) + except Exception: + pass + crawl_id = crawl_id or os.environ.get('CRAWL_ID') try: content = fetch_content(url) diff --git a/archivebox/plugins/parse_netscape_urls/templates/icon.html b/archivebox/plugins/parse_netscape_urls/templates/icon.html index 0cc8da81..4c60899c 100644 --- a/archivebox/plugins/parse_netscape_urls/templates/icon.html +++ b/archivebox/plugins/parse_netscape_urls/templates/icon.html @@ -1 +1 @@ -🔖 + diff --git a/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py b/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py index 5b153123..bdc50afa 100755 --- a/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py +++ b/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py @@ -56,6 +56,13 @@ def fetch_content(url: str) -> str: @click.option('--depth', type=int, default=0, help='Current depth level') def main(url: str, snapshot_id: str = 
None, crawl_id: str = None, depth: int = 0): """Parse RSS/Atom feed and extract article URLs.""" + env_depth = os.environ.get('SNAPSHOT_DEPTH') + if env_depth is not None: + try: + depth = int(env_depth) + except Exception: + pass + crawl_id = crawl_id or os.environ.get('CRAWL_ID') if feedparser is None: click.echo('feedparser library not installed', err=True) diff --git a/archivebox/plugins/parse_rss_urls/templates/icon.html b/archivebox/plugins/parse_rss_urls/templates/icon.html index 81de8a1a..09b3b8e7 100644 --- a/archivebox/plugins/parse_rss_urls/templates/icon.html +++ b/archivebox/plugins/parse_rss_urls/templates/icon.html @@ -1 +1 @@ -📡 + diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py index d899c742..29265700 100755 --- a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py +++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py @@ -105,6 +105,13 @@ def fetch_content(url: str) -> str: @click.option('--depth', type=int, default=0, help='Current depth level') def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0): """Parse plain text and extract URLs.""" + env_depth = os.environ.get('SNAPSHOT_DEPTH') + if env_depth is not None: + try: + depth = int(env_depth) + except Exception: + pass + crawl_id = crawl_id or os.environ.get('CRAWL_ID') try: content = fetch_content(url) diff --git a/archivebox/plugins/parse_txt_urls/templates/icon.html b/archivebox/plugins/parse_txt_urls/templates/icon.html index 0351b8bf..af23375c 100644 --- a/archivebox/plugins/parse_txt_urls/templates/icon.html +++ b/archivebox/plugins/parse_txt_urls/templates/icon.html @@ -1 +1 @@ -📃 + diff --git a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js index b53a9aea..05648a81 100644 --- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js @@ 
-52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome'; // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { - return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; + if (!fs.existsSync(STATICFILE_DIR)) return false; + const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log'); + if (!fs.existsSync(stdoutPath)) return false; + const stdout = fs.readFileSync(stdoutPath, 'utf8'); + for (const line of stdout.split('\n')) { + const trimmed = line.trim(); + if (!trimmed.startsWith('{')) continue; + try { + const record = JSON.parse(trimmed); + if (record.type === 'ArchiveResult' && record.status === 'succeeded') { + return true; + } + } catch (e) {} + } + return false; } // Wait for chrome tab to be fully loaded diff --git a/archivebox/plugins/pdf/templates/icon.html b/archivebox/plugins/pdf/templates/icon.html index 063530f3..35a0ed89 100644 --- a/archivebox/plugins/pdf/templates/icon.html +++ b/archivebox/plugins/pdf/templates/icon.html @@ -1 +1 @@ -📄 \ No newline at end of file + diff --git a/archivebox/plugins/pip/on_Binary__11_pip_install.py b/archivebox/plugins/pip/on_Binary__11_pip_install.py index edbeef4b..8737a042 100644 --- a/archivebox/plugins/pip/on_Binary__11_pip_install.py +++ b/archivebox/plugins/pip/on_Binary__11_pip_install.py @@ -11,6 +11,8 @@ Environment variables: import json import os +import shutil +import subprocess import sys from pathlib import Path @@ -46,6 +48,26 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override # Structure: lib/arm64-darwin/pip/venv (PipProvider will create venv automatically) pip_venv_path = Path(lib_dir) / 'pip' / 'venv' pip_venv_path.parent.mkdir(parents=True, exist_ok=True) + venv_python = pip_venv_path / 'bin' / 'python' + + # Prefer a stable system python for venv creation if provided/available + preferred_python = os.environ.get('PIP_VENV_PYTHON', '').strip() + if not 
preferred_python: + for candidate in ('python3.12', 'python3.11', 'python3.10'): + if shutil.which(candidate): + preferred_python = candidate + break + if preferred_python and not venv_python.exists(): + try: + subprocess.run( + [preferred_python, '-m', 'venv', str(pip_venv_path), '--upgrade-deps'], + check=True, + capture_output=True, + text=True, + ) + except Exception: + # Fall back to PipProvider-managed venv creation + pass # Use abx-pkg PipProvider to install binary with custom venv provider = PipProvider(pip_venv=pip_venv_path) @@ -87,22 +109,21 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override } print(json.dumps(record)) - # Emit PATH update if pip bin dir not already in PATH + # Emit PATH update for pip bin dir pip_bin_dir = str(pip_venv_path / 'bin') current_path = os.environ.get('PATH', '') # Check if pip_bin_dir is already in PATH path_dirs = current_path.split(':') - if pip_bin_dir not in path_dirs: - # Prepend pip_bin_dir to PATH - new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/PATH', - 'value': new_path, - })) - click.echo(f" Added {pip_bin_dir} to PATH", err=True) + new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir + if pip_bin_dir in path_dirs: + new_path = current_path + print(json.dumps({ + 'type': 'Machine', + 'config': { + 'PATH': new_path, + }, + })) # Log human-readable info to stderr click.echo(f"Installed {name} at {binary.abspath}", err=True) diff --git a/archivebox/plugins/pip/tests/__init__.py b/archivebox/plugins/pip/tests/__init__.py deleted file mode 100644 index 28ac0d82..00000000 --- a/archivebox/plugins/pip/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the pip binary provider plugin.""" diff --git a/archivebox/plugins/pip/tests/test_pip_provider.py b/archivebox/plugins/pip/tests/test_pip_provider.py index a22ef183..4a4fe610 100644 --- 
a/archivebox/plugins/pip/tests/test_pip_provider.py +++ b/archivebox/plugins/pip/tests/test_pip_provider.py @@ -22,7 +22,7 @@ from django.test import TestCase # Get the path to the pip provider hook PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_pip_provider.py' +INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_pip_install.py'), None) class TestPipProviderHook(TestCase): @@ -33,6 +33,10 @@ class TestPipProviderHook(TestCase): self.temp_dir = tempfile.mkdtemp() self.output_dir = Path(self.temp_dir) / 'output' self.output_dir.mkdir() + self.lib_dir = Path(self.temp_dir) / 'lib' / 'x86_64-linux' + self.lib_dir.mkdir(parents=True, exist_ok=True) + self.lib_dir = Path(self.temp_dir) / 'lib' / 'x86_64-linux' + self.lib_dir.mkdir(parents=True, exist_ok=True) def tearDown(self): """Clean up.""" @@ -41,7 +45,7 @@ class TestPipProviderHook(TestCase): def test_hook_script_exists(self): """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") + self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") def test_hook_help(self): """Hook should accept --help without error.""" @@ -55,16 +59,19 @@ class TestPipProviderHook(TestCase): # At minimum should not crash with Python error self.assertNotIn('Traceback', result.stderr) - def test_hook_finds_python(self): - """Hook should find Python binary.""" + def test_hook_finds_pip(self): + """Hook should find pip binary.""" env = os.environ.copy() env['DATA_DIR'] = self.temp_dir + env['LIB_DIR'] = str(self.lib_dir) result = subprocess.run( [ sys.executable, str(INSTALL_HOOK), - '--name=python3', - '--binproviders=pip,env', + '--name=pip', + '--binproviders=pip', + '--binary-id=test-uuid', + '--machine-id=test-machine', ], capture_output=True, text=True, @@ -80,7 +87,7 @@ class TestPipProviderHook(TestCase): if line.startswith('{'): try: record = json.loads(line) - if record.get('type') == 'Binary' and 
record.get('name') == 'python3': + if record.get('type') == 'Binary' and record.get('name') == 'pip': jsonl_found = True # Verify structure self.assertIn('abspath', record) @@ -92,19 +99,22 @@ class TestPipProviderHook(TestCase): # Should not crash self.assertNotIn('Traceback', result.stderr) - # Should find python3 via pip or env provider - self.assertTrue(jsonl_found, "Expected to find python3 binary in JSONL output") + # Should find pip via pip provider + self.assertTrue(jsonl_found, "Expected to find pip binary in JSONL output") def test_hook_unknown_package(self): """Hook should handle unknown packages gracefully.""" env = os.environ.copy() env['DATA_DIR'] = self.temp_dir + env['LIB_DIR'] = str(self.lib_dir) result = subprocess.run( [ sys.executable, str(INSTALL_HOOK), '--name=nonexistent_package_xyz123', '--binproviders=pip', + '--binary-id=test-uuid', + '--machine-id=test-machine', ], capture_output=True, text=True, @@ -148,6 +158,8 @@ class TestPipProviderIntegration(TestCase): sys.executable, str(INSTALL_HOOK), '--name=pip', '--binproviders=pip,env', + '--binary-id=test-uuid', + '--machine-id=test-machine', ], capture_output=True, text=True, diff --git a/archivebox/plugins/puppeteer/__init__.py b/archivebox/plugins/puppeteer/__init__.py new file mode 100644 index 00000000..e32e0f82 --- /dev/null +++ b/archivebox/plugins/puppeteer/__init__.py @@ -0,0 +1 @@ +# Plugin namespace for puppeteer utilities. diff --git a/archivebox/plugins/puppeteer/on_Binary__12_puppeteer_install.py b/archivebox/plugins/puppeteer/on_Binary__12_puppeteer_install.py new file mode 100644 index 00000000..a30e9cc0 --- /dev/null +++ b/archivebox/plugins/puppeteer/on_Binary__12_puppeteer_install.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Install Chromium via the Puppeteer CLI. 
+ +Usage: on_Binary__12_puppeteer_install.py --binary-id= --machine-id= --name= +Output: Binary JSONL record to stdout after installation +""" + +import json +import os +import re +import sys +from pathlib import Path + +import rich_click as click +from abx_pkg import Binary, EnvProvider, NpmProvider, BinProviderOverrides + +# Fix pydantic forward reference issue +NpmProvider.model_rebuild() + + +@click.command() +@click.option('--machine-id', required=True, help='Machine UUID') +@click.option('--binary-id', required=True, help='Binary UUID') +@click.option('--name', required=True, help='Binary name to install') +@click.option('--binproviders', default='*', help='Allowed providers (comma-separated)') +@click.option('--overrides', default=None, help='JSON-encoded overrides dict') +def main(machine_id: str, binary_id: str, name: str, binproviders: str, overrides: str | None) -> None: + if binproviders != '*' and 'puppeteer' not in binproviders.split(','): + sys.exit(0) + + if name not in ('chromium', 'chrome'): + sys.exit(0) + + lib_dir = os.environ.get('LIB_DIR', '').strip() + if not lib_dir: + click.echo('ERROR: LIB_DIR environment variable not set', err=True) + sys.exit(1) + + npm_prefix = Path(lib_dir) / 'npm' + npm_prefix.mkdir(parents=True, exist_ok=True) + npm_provider = NpmProvider(npm_prefix=npm_prefix) + cache_dir = Path(lib_dir) / 'puppeteer' + cache_dir.mkdir(parents=True, exist_ok=True) + os.environ.setdefault('PUPPETEER_CACHE_DIR', str(cache_dir)) + + puppeteer_binary = Binary( + name='puppeteer', + binproviders=[npm_provider, EnvProvider()], + overrides={'npm': {'packages': ['puppeteer']}}, + ).load() + + if not puppeteer_binary.abspath: + click.echo('ERROR: puppeteer binary not found (install puppeteer first)', err=True) + sys.exit(1) + + install_args = _parse_override_packages(overrides, default=['chromium@latest', '--install-deps']) + cmd = ['browsers', 'install', *install_args] + proc = puppeteer_binary.exec(cmd=cmd, timeout=300) + if 
proc.returncode != 0: + click.echo(proc.stdout.strip(), err=True) + click.echo(proc.stderr.strip(), err=True) + click.echo(f'ERROR: puppeteer install failed ({proc.returncode})', err=True) + sys.exit(1) + + chromium_binary = _load_chromium_binary(proc.stdout + '\n' + proc.stderr) + if not chromium_binary or not chromium_binary.abspath: + click.echo('ERROR: failed to locate Chromium after install', err=True) + sys.exit(1) + + _emit_chromium_binary_record( + binary=chromium_binary, + machine_id=machine_id, + binary_id=binary_id, + ) + + config_patch = { + 'CHROME_BINARY': str(chromium_binary.abspath), + 'CHROMIUM_VERSION': str(chromium_binary.version) if chromium_binary.version else '', + } + + print(json.dumps({ + 'type': 'Machine', + 'config': config_patch, + })) + + sys.exit(0) + + +def _parse_override_packages(overrides: str | None, default: list[str]) -> list[str]: + if not overrides: + return default + try: + overrides_dict = json.loads(overrides) + except json.JSONDecodeError: + return default + + if isinstance(overrides_dict, dict): + provider_overrides = overrides_dict.get('puppeteer') + if isinstance(provider_overrides, dict): + packages = provider_overrides.get('packages') + if isinstance(packages, list) and packages: + return [str(arg) for arg in packages] + if isinstance(provider_overrides, list) and provider_overrides: + return [str(arg) for arg in provider_overrides] + if isinstance(overrides_dict, list) and overrides_dict: + return [str(arg) for arg in overrides_dict] + + return default + + +def _emit_chromium_binary_record(binary: Binary, machine_id: str, binary_id: str) -> None: + record = { + 'type': 'Binary', + 'name': 'chromium', + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'puppeteer', + 'machine_id': machine_id, + 'binary_id': binary_id, + } + print(json.dumps(record)) + + +def _load_chromium_binary(output: str) -> Binary | None: + candidates: 
list[Path] = [] + match = re.search(r'(?:chromium|chrome)@[^\s]+\s+(\S+)', output) + if match: + candidates.append(Path(match.group(1))) + + cache_dirs: list[Path] = [] + cache_env = os.environ.get('PUPPETEER_CACHE_DIR') + if cache_env: + cache_dirs.append(Path(cache_env)) + + home = Path.home() + cache_dirs.extend([ + home / '.cache' / 'puppeteer', + home / 'Library' / 'Caches' / 'puppeteer', + ]) + + for base in cache_dirs: + for root in (base, base / 'chromium', base / 'chrome'): + try: + candidates.extend(root.rglob('Chromium.app/Contents/MacOS/Chromium')) + except Exception: + pass + try: + candidates.extend(root.rglob('chrome')) + except Exception: + pass + + for candidate in candidates: + try: + binary = Binary( + name='chromium', + binproviders=[EnvProvider()], + overrides={'env': {'abspath': str(candidate)}}, + ).load() + except Exception: + continue + if binary.abspath: + return binary + + return None + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/puppeteer/on_Crawl__60_puppeteer_install.py b/archivebox/plugins/puppeteer/on_Crawl__60_puppeteer_install.py new file mode 100644 index 00000000..9125dc2f --- /dev/null +++ b/archivebox/plugins/puppeteer/on_Crawl__60_puppeteer_install.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +""" +Emit Puppeteer Binary dependency for the crawl. 
+""" + +import json +import os +import sys + + +def main() -> None: + enabled = os.environ.get('PUPPETEER_ENABLED', 'true').lower() not in ('false', '0', 'no', 'off') + if not enabled: + sys.exit(0) + + record = { + 'type': 'Binary', + 'name': 'puppeteer', + 'binproviders': 'npm,env', + 'overrides': { + 'npm': { + 'packages': ['puppeteer'], + } + }, + } + print(json.dumps(record)) + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/puppeteer/tests/test_puppeteer.py b/archivebox/plugins/puppeteer/tests/test_puppeteer.py new file mode 100644 index 00000000..5d230a7d --- /dev/null +++ b/archivebox/plugins/puppeteer/tests/test_puppeteer.py @@ -0,0 +1,124 @@ +"""Integration tests for puppeteer plugin.""" + +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +import pytest + +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, +) + + +PLUGIN_DIR = get_plugin_dir(__file__) +CRAWL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Crawl__*_puppeteer_install.py') +BINARY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Binary__*_puppeteer_install.py') +NPM_BINARY_HOOK = PLUGIN_DIR.parent / 'npm' / 'on_Binary__10_npm_install.py' + + +def test_hook_scripts_exist(): + assert CRAWL_HOOK and CRAWL_HOOK.exists(), f"Hook not found: {CRAWL_HOOK}" + assert BINARY_HOOK and BINARY_HOOK.exists(), f"Hook not found: {BINARY_HOOK}" + + +def test_crawl_hook_emits_puppeteer_binary(): + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + result = subprocess.run( + [sys.executable, str(CRAWL_HOOK)], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30, + ) + + assert result.returncode == 0, f"crawl hook failed: {result.stderr}" + records = [json.loads(line) for line in result.stdout.splitlines() if line.strip().startswith('{')] + binaries = [r for r in records if r.get('type') == 'Binary' and r.get('name') == 'puppeteer'] + 
assert binaries, f"Expected Binary record for puppeteer, got: {records}" + assert 'npm' in binaries[0].get('binproviders', ''), "puppeteer should be installable via npm provider" + + +@pytest.mark.skipif(shutil.which('npm') is None, reason='npm is required for puppeteer installation') +def test_puppeteer_installs_chromium(): + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + lib_dir = tmpdir / 'lib' / 'arm64-darwin' + lib_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env['LIB_DIR'] = str(lib_dir) + + crawl_result = subprocess.run( + [sys.executable, str(CRAWL_HOOK)], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30, + ) + assert crawl_result.returncode == 0, f"crawl hook failed: {crawl_result.stderr}" + crawl_records = [json.loads(line) for line in crawl_result.stdout.splitlines() if line.strip().startswith('{')] + puppeteer_record = next( + (r for r in crawl_records if r.get('type') == 'Binary' and r.get('name') == 'puppeteer'), + None, + ) + assert puppeteer_record, f"Expected puppeteer Binary record, got: {crawl_records}" + + npm_result = subprocess.run( + [ + sys.executable, + str(NPM_BINARY_HOOK), + '--machine-id=test-machine', + '--binary-id=test-puppeteer', + '--name=puppeteer', + f"--binproviders={puppeteer_record.get('binproviders', '*')}", + '--overrides=' + json.dumps(puppeteer_record.get('overrides') or {}), + ], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=120, + ) + assert npm_result.returncode == 0, ( + "puppeteer npm install failed\n" + f"stdout:\n{npm_result.stdout}\n" + f"stderr:\n{npm_result.stderr}" + ) + + result = subprocess.run( + [ + sys.executable, + str(BINARY_HOOK), + '--machine-id=test-machine', + '--binary-id=test-binary', + '--name=chromium', + '--binproviders=puppeteer', + '--overrides=' + json.dumps({'puppeteer': ['chromium@latest', '--install-deps']}), + ], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=120, + ) + 
+ assert result.returncode == 0, ( + "puppeteer binary hook failed\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + records = [json.loads(line) for line in result.stdout.splitlines() if line.strip().startswith('{')] + binaries = [r for r in records if r.get('type') == 'Binary' and r.get('name') == 'chromium'] + assert binaries, f"Expected Binary record for chromium, got: {records}" + abspath = binaries[0].get('abspath') + assert abspath and Path(abspath).exists(), f"Chromium binary path invalid: {abspath}" diff --git a/archivebox/plugins/readability/binaries.jsonl b/archivebox/plugins/readability/binaries.jsonl deleted file mode 100644 index e8a1974a..00000000 --- a/archivebox/plugins/readability/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "readability-extractor", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["https://github.com/ArchiveBox/readability-extractor"]}}} diff --git a/archivebox/plugins/readability/on_Crawl__11_readability_install.py b/archivebox/plugins/readability/on_Crawl__11_readability_install.py deleted file mode 100755 index ea0791ef..00000000 --- a/archivebox/plugins/readability/on_Crawl__11_readability_install.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect readability-extractor binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if readability is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'overrides': { - 'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'], - }, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - readability_enabled = get_env_bool('READABILITY_ENABLED', True) - readability_binary = get_env('READABILITY_BINARY', 'readability-extractor') - - if not readability_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=readability_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='readability-extractor') - else: - # Binary not found - output_binary_missing(name='readability-extractor', binproviders='npm') - except Exception: - # Binary not found - 
output_binary_missing(name='readability-extractor', binproviders='npm') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/readability/on_Crawl__35_readability_install.py b/archivebox/plugins/readability/on_Crawl__35_readability_install.py new file mode 100755 index 00000000..6705c6bb --- /dev/null +++ b/archivebox/plugins/readability/on_Crawl__35_readability_install.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +""" +Emit readability-extractor Binary dependency for the crawl. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'overrides': { + 'npm': { + 'packages': ['https://github.com/ArchiveBox/readability-extractor'], + }, + }, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + readability_enabled = get_env_bool('READABILITY_ENABLED', True) + + if not readability_enabled: + sys.exit(0) + + output_binary(name='readability-extractor', binproviders='npm,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/readability/on_Snapshot__56_readability.py b/archivebox/plugins/readability/on_Snapshot__56_readability.py index 2777479a..4c23fa28 100644 --- a/archivebox/plugins/readability/on_Snapshot__56_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py @@ -63,19 +63,25 @@ def find_html_source() -> str | None: # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories 
search_patterns = [ 'singlefile/singlefile.html', + '*_singlefile/singlefile.html', 'singlefile/*.html', + '*_singlefile/*.html', 'dom/output.html', + '*_dom/output.html', 'dom/*.html', + '*_dom/*.html', 'wget/**/*.html', + '*_wget/**/*.html', 'wget/**/*.htm', + '*_wget/**/*.htm', ] - cwd = Path.cwd() - for pattern in search_patterns: - matches = list(cwd.glob(pattern)) - for match in matches: - if match.is_file() and match.stat().st_size > 0: - return str(match) + for base in (Path.cwd(), Path.cwd().parent): + for pattern in search_patterns: + matches = list(base.glob(pattern)) + for match in matches: + if match.is_file() and match.stat().st_size > 0: + return str(match) return None diff --git a/archivebox/plugins/readability/templates/icon.html b/archivebox/plugins/readability/templates/icon.html index 66336e65..ae67c26f 100644 --- a/archivebox/plugins/readability/templates/icon.html +++ b/archivebox/plugins/readability/templates/icon.html @@ -1 +1 @@ -📖 \ No newline at end of file + diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js similarity index 93% rename from archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js rename to archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js index e051cd50..66aac407 100755 --- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__25_redirects.bg.js @@ -6,7 +6,7 @@ * redirect chain from the initial request. It stays alive through navigation * and emits JSONL on SIGTERM. 
* - * Usage: on_Snapshot__31_redirects.bg.js --url= --snapshot-id= + * Usage: on_Snapshot__25_redirects.bg.js --url= --snapshot-id= * Output: Writes redirects.jsonl */ @@ -169,7 +169,7 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__31_redirects.bg.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__25_redirects.bg.js --url= --snapshot-id='); process.exit(1); } @@ -191,11 +191,15 @@ async function main() { // Set up redirect listener BEFORE navigation await setupRedirectListener(); - // Wait for chrome_navigate to complete (BLOCKING) - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 1000); + // Wait for chrome_navigate to complete (non-fatal) + try { + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 1000); + } catch (e) { + console.error(`WARN: ${e.message}`); + } // Keep process alive until killed by cleanup - console.error('Redirect tracking complete, waiting for cleanup signal...'); + // console.error('Redirect tracking complete, waiting for cleanup signal...'); // Keep the process alive indefinitely await new Promise(() => {}); // Never resolves diff --git a/archivebox/plugins/redirects/templates/icon.html b/archivebox/plugins/redirects/templates/icon.html index e69de29b..8f32e981 100644 --- a/archivebox/plugins/redirects/templates/icon.html +++ b/archivebox/plugins/redirects/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/redirects/tests/__init__.py b/archivebox/plugins/redirects/tests/__init__.py deleted file mode 100644 index 6bc72141..00000000 --- a/archivebox/plugins/redirects/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the redirects plugin.""" diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py index 0164d461..452c5dd6 100644 --- a/archivebox/plugins/redirects/tests/test_redirects.py +++ b/archivebox/plugins/redirects/tests/test_redirects.py 
@@ -10,6 +10,7 @@ import shutil import subprocess import sys import tempfile +import time from pathlib import Path import pytest @@ -77,14 +78,13 @@ class TestRedirectsWithChrome(TestCase): # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - # Run redirects hook with the active Chrome session - result = subprocess.run( + # Run redirects hook with the active Chrome session (background hook) + result = subprocess.Popen( ['node', str(REDIRECTS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), - capture_output=True, + cwd=str(snapshot_chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=60, env=env ) @@ -93,6 +93,12 @@ class TestRedirectsWithChrome(TestCase): redirects_data = None + # Wait briefly for background hook to write output + for _ in range(10): + if redirects_output.exists() and redirects_output.stat().st_size > 0: + break + time.sleep(1) + # Try parsing from file first if redirects_output.exists(): with open(redirects_output) as f: @@ -107,7 +113,11 @@ class TestRedirectsWithChrome(TestCase): # Try parsing from stdout if not in file if not redirects_data: - for line in result.stdout.split('\n'): + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + stdout, stderr = "", "" + for line in stdout.split('\n'): line = line.strip() if line.startswith('{'): try: @@ -120,9 +130,17 @@ class TestRedirectsWithChrome(TestCase): # Verify hook ran successfully # example.com typically doesn't redirect, so we just verify no errors - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - self.assertNotIn('Traceback', result.stderr) - self.assertNotIn('Error:', result.stderr) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() + stdout, stderr = result.communicate() + else: + stdout, stderr = 
result.communicate() + self.assertNotIn('Traceback', stderr) + self.assertNotIn('Error:', stderr) except RuntimeError as e: if 'Chrome' in str(e) or 'CDP' in str(e): diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index 9d8f16ed..c7dd6491 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -33,6 +33,11 @@ const PLUGIN_NAME = 'responses'; const OUTPUT_DIR = '.'; const CHROME_SESSION_DIR = '../chrome'; +let browser = null; +let page = null; +let responseCount = 0; +let shuttingDown = false; + // Resource types to capture (by default, capture everything) const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket']; @@ -199,6 +204,7 @@ async function setupListener() { }; fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n'); + responseCount += 1; } catch (e) { // Ignore errors @@ -208,6 +214,31 @@ async function setupListener() { return { browser, page }; } +function emitResult(status = 'succeeded') { + if (shuttingDown) return; + shuttingDown = true; + + const outputStr = responseCount > 0 + ? 
`responses/ (${responseCount} responses)` + : 'responses/'; + console.log(JSON.stringify({ + type: 'ArchiveResult', + status, + output_str: outputStr, + })); +} + +async function handleShutdown(signal) { + console.error(`\nReceived ${signal}, emitting final results...`); + emitResult('succeeded'); + if (browser) { + try { + browser.disconnect(); + } catch (e) {} + } + process.exit(0); +} + async function main() { const args = parseArgs(); const url = args.url; @@ -224,24 +255,27 @@ async function main() { process.exit(0); } - const timeout = getEnvInt('RESPONSES_TIMEOUT', 30) * 1000; - try { // Set up listener BEFORE navigation - await setupListener(); + const connection = await setupListener(); + browser = connection.browser; + page = connection.page; - // Wait for chrome_navigate to complete (BLOCKING) - // Extra 1s delay for late responses - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 1000); + // Register signal handlers for graceful shutdown + process.on('SIGTERM', () => handleShutdown('SIGTERM')); + process.on('SIGINT', () => handleShutdown('SIGINT')); - // Output clean JSONL - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: 'responses/', - })); + // Wait for chrome_navigate to complete (non-fatal) + try { + const timeout = getEnvInt('RESPONSES_TIMEOUT', 30) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 1000); + } catch (e) { + console.error(`WARN: ${e.message}`); + } - process.exit(0); + // console.error('Responses listener active, waiting for cleanup signal...'); + await new Promise(() => {}); // Keep alive until SIGTERM + return; } catch (e) { const error = `${e.name}: ${e.message}`; diff --git a/archivebox/plugins/responses/templates/icon.html b/archivebox/plugins/responses/templates/icon.html index e69de29b..51210acb 100644 --- a/archivebox/plugins/responses/templates/icon.html +++ b/archivebox/plugins/responses/templates/icon.html @@ -0,0 +1 @@ + diff --git 
a/archivebox/plugins/responses/tests/__init__.py b/archivebox/plugins/responses/tests/__init__.py deleted file mode 100644 index d31fa890..00000000 --- a/archivebox/plugins/responses/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the responses plugin.""" diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py index c66f7652..82a5fa77 100644 --- a/archivebox/plugins/responses/tests/test_responses.py +++ b/archivebox/plugins/responses/tests/test_responses.py @@ -10,6 +10,7 @@ import shutil import subprocess import sys import tempfile +import time from pathlib import Path import pytest @@ -76,22 +77,36 @@ class TestResponsesWithChrome(TestCase): # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - # Run responses hook with the active Chrome session - result = subprocess.run( + # Run responses hook with the active Chrome session (background hook) + result = subprocess.Popen( ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), - capture_output=True, + cwd=str(snapshot_chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=120, # Longer timeout as it waits for navigation env=env ) # Check for output directory and index file index_output = snapshot_chrome_dir / 'index.jsonl' - # Verify hook ran (may timeout waiting for page_loaded.txt in test mode) - self.assertNotIn('Traceback', result.stderr) + # Wait briefly for background hook to write output + for _ in range(10): + if index_output.exists() and index_output.stat().st_size > 0: + break + time.sleep(1) + + # Verify hook ran (may keep running waiting for cleanup signal) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() + stdout, stderr = result.communicate() + else: + stdout, stderr = 
result.communicate() + self.assertNotIn('Traceback', stderr) # If index file exists, verify it's valid JSONL if index_output.exists(): diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index fae0bf93..76390846 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -58,7 +58,21 @@ const CHROME_SESSION_DIR = '../chrome'; // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { - return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0; + if (!fs.existsSync(STATICFILE_DIR)) return false; + const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log'); + if (!fs.existsSync(stdoutPath)) return false; + const stdout = fs.readFileSync(stdoutPath, 'utf8'); + for (const line of stdout.split('\n')) { + const trimmed = line.trim(); + if (!trimmed.startsWith('{')) continue; + try { + const record = JSON.parse(trimmed); + if (record.type === 'ArchiveResult' && record.status === 'succeeded') { + return true; + } + } catch (e) {} + } + return false; } // Wait for chrome tab to be fully loaded diff --git a/archivebox/plugins/screenshot/templates/icon.html b/archivebox/plugins/screenshot/templates/icon.html index e76b5f98..4236aee3 100644 --- a/archivebox/plugins/screenshot/templates/icon.html +++ b/archivebox/plugins/screenshot/templates/icon.html @@ -1 +1 @@ -📷 \ No newline at end of file + diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index 2d804757..9627ec02 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -166,7 +166,7 @@ def test_skips_when_staticfile_exists(): # Create staticfile output to simulate staticfile extractor already ran staticfile_dir = 
snapshot_dir / 'staticfile' staticfile_dir.mkdir() - (staticfile_dir / 'index.html').write_text('') + (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n') env = get_test_env() result = subprocess.run( diff --git a/archivebox/plugins/search_backend_ripgrep/binaries.jsonl b/archivebox/plugins/search_backend_ripgrep/binaries.jsonl deleted file mode 100644 index f66337f7..00000000 --- a/archivebox/plugins/search_backend_ripgrep/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "rg", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["ripgrep"]}}} diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py deleted file mode 100755 index 57502514..00000000 --- a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -""" -Install hook for ripgrep binary. - -Runs at crawl start to verify ripgrep is available when SEARCH_BACKEND_ENGINE='ripgrep'. -Outputs JSONL for Binary and Machine config updates. -Uses abx-pkg to handle installation via apt/brew providers. 
-""" - -import os -import sys -import json - - -def find_ripgrep() -> dict | None: - """Find ripgrep binary using abx-pkg, respecting RIPGREP_BINARY env var.""" - # Quick check: if RIPGREP_BINARY is set and exists, skip expensive lookup - configured_binary = os.environ.get('RIPGREP_BINARY', '').strip() - if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): - # Binary is already configured and valid - exit immediately - sys.exit(0) - - try: - from abx_pkg import Binary, EnvProvider, AptProvider, BrewProvider, BinProviderOverrides - - # Try to find ripgrep using abx-pkg (EnvProvider checks PATH, apt/brew handle installation) - binary = Binary( - name='rg', - binproviders=[EnvProvider(), AptProvider(), BrewProvider()], - overrides={ - 'apt': {'packages': ['ripgrep']}, - 'brew': {'packages': ['ripgrep']}, - } - ) - - loaded = binary.load() - if loaded and loaded.abspath: - return { - 'name': 'rg', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, - 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', - } - except Exception as e: - print(f"Error loading ripgrep: {e}", file=sys.stderr) - pass - - return None - - -def main(): - # Only proceed if ripgrep backend is enabled - search_backend_engine = os.environ.get('SEARCH_BACKEND_ENGINE', 'ripgrep').strip() - if search_backend_engine != 'ripgrep': - # Not using ripgrep, exit successfully without output - sys.exit(0) - - result = find_ripgrep() - - if result and result.get('abspath'): - print(json.dumps({ - 'type': 'Binary', - 'name': result['name'], - 'abspath': result['abspath'], - 'version': result['version'], - 'binprovider': result['binprovider'], - })) - - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/RIPGREP_BINARY', - 'value': result['abspath'], - })) - - if result['version']: - print(json.dumps({ - 
'type': 'Machine', - '_method': 'update', - 'key': 'config/RIPGREP_VERSION', - 'value': result['version'], - })) - - sys.exit(0) - else: - print(f"Ripgrep binary not found (install with: apt install ripgrep or brew install ripgrep)", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__50_ripgrep_install.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__50_ripgrep_install.py new file mode 100755 index 00000000..071dbb5b --- /dev/null +++ b/archivebox/plugins/search_backend_ripgrep/on_Crawl__50_ripgrep_install.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +""" +Emit ripgrep Binary dependency for the crawl. +""" + +import os +import sys +import json + + +def main(): + # Only proceed if ripgrep backend is enabled + search_backend_engine = os.environ.get('SEARCH_BACKEND_ENGINE', 'ripgrep').strip() + if search_backend_engine != 'ripgrep': + # Not using ripgrep, exit successfully without output + sys.exit(0) + + machine_id = os.environ.get('MACHINE_ID', '') + print(json.dumps({ + 'type': 'Binary', + 'name': 'rg', + 'binproviders': 'apt,brew,env', + 'overrides': { + 'apt': {'packages': ['ripgrep']}, + }, + 'machine_id': machine_id, + })) + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/search_backend_ripgrep/search.py b/archivebox/plugins/search_backend_ripgrep/search.py index 140a32d1..171b60bb 100644 --- a/archivebox/plugins/search_backend_ripgrep/search.py +++ b/archivebox/plugins/search_backend_ripgrep/search.py @@ -18,8 +18,6 @@ import shutil from pathlib import Path from typing import List, Iterable -from django.conf import settings - def get_env(name: str, default: str = '') -> str: return os.environ.get(name, default).strip() @@ -46,6 +44,16 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]: return default if default is not None else [] +def _get_archive_dir() -> Path: + archive_dir = 
os.environ.get('ARCHIVE_DIR', '').strip() + if archive_dir: + return Path(archive_dir) + data_dir = os.environ.get('DATA_DIR', '').strip() + if data_dir: + return Path(data_dir) / 'archive' + return Path.cwd() / 'archive' + + def search(query: str) -> List[str]: """Search for snapshots using ripgrep.""" rg_binary = get_env('RIPGREP_BINARY', 'rg') @@ -57,7 +65,7 @@ def search(query: str) -> List[str]: ripgrep_args = get_env_array('RIPGREP_ARGS', []) ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', []) - archive_dir = Path(settings.ARCHIVE_DIR) + archive_dir = _get_archive_dir() if not archive_dir.exists(): return [] diff --git a/archivebox/plugins/search_backend_ripgrep/tests/__init__.py b/archivebox/plugins/search_backend_ripgrep/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py index 8057783a..60eb6e3a 100644 --- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py @@ -23,7 +23,7 @@ import pytest def test_ripgrep_hook_detects_binary_from_path(): """Test that ripgrep hook finds binary using abx-pkg when env var is just a name.""" - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' # Skip if rg is not installed if not shutil.which('rg'): @@ -44,26 +44,19 @@ def test_ripgrep_hook_detects_binary_from_path(): assert result.returncode == 0, f"Hook failed: {result.stderr}" - # Parse JSONL output (filter out COMPUTED: lines) + # Parse JSONL output (filter out non-JSON lines) lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.strip().startswith('{')] - assert len(lines) >= 2, "Expected at least 2 JSONL lines (Binary + Machine config)" + assert 
len(lines) >= 1, "Expected at least 1 JSONL line (Binary)" binary = json.loads(lines[0]) assert binary['type'] == 'Binary' assert binary['name'] == 'rg' - assert '/' in binary['abspath'], "Expected full path, not just binary name" - assert Path(binary['abspath']).is_file(), "Binary path should exist" - assert binary['version'], "Version should be detected" - - machine_config = json.loads(lines[1]) - assert machine_config['type'] == 'Machine' - assert machine_config['key'] == 'config/RIPGREP_BINARY' - assert '/' in machine_config['value'], "Machine config should store full path" + assert 'binproviders' in binary, "Expected binproviders declaration" def test_ripgrep_hook_skips_when_backend_not_ripgrep(): """Test that ripgrep hook exits silently when search backend is not ripgrep.""" - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' env = os.environ.copy() env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend @@ -82,7 +75,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep(): def test_ripgrep_hook_handles_absolute_path(): """Test that ripgrep hook exits successfully when RIPGREP_BINARY is a valid absolute path.""" - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' rg_path = shutil.which('rg') if not rg_path: @@ -100,9 +93,9 @@ def test_ripgrep_hook_handles_absolute_path(): timeout=10, ) - # When binary is already configured with valid absolute path, hook exits early without output assert result.returncode == 0, f"Hook should exit successfully when binary already configured: {result.stderr}" - # No output is expected/needed when binary is already valid + lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] + assert lines, "Expected Binary JSONL output when backend is ripgrep" @pytest.mark.django_db @@ 
-115,6 +108,8 @@ def test_machine_config_overrides_base_config(): """ from archivebox.machine.models import Machine, Binary + import archivebox.machine.models as models + models._CURRENT_MACHINE = None machine = Machine.current() # Simulate a hook detecting chrome and storing it with a different path than base config @@ -177,7 +172,9 @@ def test_install_creates_binary_records(): This verifies the Binary model works correctly with the database. """ from archivebox.machine.models import Machine, Binary + import archivebox.machine.models as models + models._CURRENT_MACHINE = None machine = Machine.current() initial_binary_count = Binary.objects.filter(machine=machine).count() @@ -188,7 +185,7 @@ def test_install_creates_binary_records(): abspath='/usr/bin/test-binary', version='1.0.0', binprovider='env', - status='succeeded' + status=Binary.StatusChoices.INSTALLED ) # Verify Binary record was created @@ -220,7 +217,7 @@ def test_ripgrep_only_detected_when_backend_enabled(): if not shutil.which('rg'): pytest.skip("ripgrep not installed") - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__50_ripgrep_install.py' # Test 1: With ripgrep backend - should output Binary record env1 = os.environ.copy() @@ -237,8 +234,7 @@ def test_ripgrep_only_detected_when_backend_enabled(): assert result1.returncode == 0, f"Hook should succeed with ripgrep backend: {result1.stderr}" # Should output Binary JSONL when backend is ripgrep - assert 'Binary' in result1.stdout or 'COMPUTED:' in result1.stdout, \ - "Should output Binary or COMPUTED when backend=ripgrep" + assert 'Binary' in result1.stdout, "Should output Binary when backend=ripgrep" # Test 2: With different backend - should output nothing env2 = os.environ.copy() diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py index 75513d34..1f0ce7fa 100644 
--- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_search.py @@ -117,8 +117,8 @@ class TestRipgrepSearch(TestCase): 'title/title.txt': 'JavaScript Basics', }) self._create_snapshot('snap-003', { - 'wget/index.html': 'Web archiving best practices', - 'title/title.txt': 'Web Archiving Guide', + 'wget/index.html': 'Web archiving guide and best practices', + 'title/title.txt': 'Web Archiving guide', }) # Patch settings diff --git a/archivebox/plugins/search_backend_sonic/templates/icon.html b/archivebox/plugins/search_backend_sonic/templates/icon.html index e69de29b..bf81a372 100644 --- a/archivebox/plugins/search_backend_sonic/templates/icon.html +++ b/archivebox/plugins/search_backend_sonic/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/search_backend_sqlite/search.py b/archivebox/plugins/search_backend_sqlite/search.py index afd52f15..0d3f5539 100644 --- a/archivebox/plugins/search_backend_sqlite/search.py +++ b/archivebox/plugins/search_backend_sqlite/search.py @@ -14,8 +14,6 @@ import sqlite3 from pathlib import Path from typing import List, Iterable -from django.conf import settings - # Config with old var names for backwards compatibility SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip() @@ -23,9 +21,16 @@ FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip() +def _get_data_dir() -> Path: + data_dir = os.environ.get('DATA_DIR', '').strip() + if data_dir: + return Path(data_dir) + return Path.cwd() / 'data' + + def get_db_path() -> Path: """Get path to the search index database.""" - return Path(settings.DATA_DIR) / SQLITEFTS_DB + return _get_data_dir() / SQLITEFTS_DB def search(query: str) -> List[str]: diff --git a/archivebox/plugins/search_backend_sqlite/templates/icon.html 
b/archivebox/plugins/search_backend_sqlite/templates/icon.html index e69de29b..3c9f8646 100644 --- a/archivebox/plugins/search_backend_sqlite/templates/icon.html +++ b/archivebox/plugins/search_backend_sqlite/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/search_backend_sqlite/tests/__init__.py b/archivebox/plugins/search_backend_sqlite/tests/__init__.py deleted file mode 100644 index 6bef82e4..00000000 --- a/archivebox/plugins/search_backend_sqlite/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the SQLite FTS5 search backend.""" diff --git a/archivebox/plugins/seo/templates/icon.html b/archivebox/plugins/seo/templates/icon.html index e69de29b..1306d22d 100644 --- a/archivebox/plugins/seo/templates/icon.html +++ b/archivebox/plugins/seo/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/seo/tests/__init__.py b/archivebox/plugins/seo/tests/__init__.py deleted file mode 100644 index f2b12854..00000000 --- a/archivebox/plugins/seo/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the SEO plugin.""" diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py index e365e4b0..63233b16 100644 --- a/archivebox/plugins/seo/tests/test_seo.py +++ b/archivebox/plugins/seo/tests/test_seo.py @@ -79,8 +79,7 @@ class TestSEOWithChrome(TestCase): # Run SEO hook with the active Chrome session result = subprocess.run( ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, diff --git a/archivebox/plugins/singlefile/binaries.jsonl b/archivebox/plugins/singlefile/binaries.jsonl deleted file mode 100644 index e1241163..00000000 --- a/archivebox/plugins/singlefile/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "single-file", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["single-file-cli"]}}} diff --git 
a/archivebox/plugins/singlefile/config.json b/archivebox/plugins/singlefile/config.json index fe4962a0..c522efba 100644 --- a/archivebox/plugins/singlefile/config.json +++ b/archivebox/plugins/singlefile/config.json @@ -25,7 +25,7 @@ "type": "string", "default": "", "x-fallback": "CHROME_BINARY", - "description": "Path to Chrome/Chromium binary" + "description": "Path to Chromium binary" }, "SINGLEFILE_TIMEOUT": { "type": "integer", diff --git a/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py b/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py deleted file mode 100755 index b1bb2a68..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect single-file binary and emit Binary JSONL record. - -Output: Binary JSONL record to stdout if single-file is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 
'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True) - - if not singlefile_enabled: - sys.exit(0) - - provider = EnvProvider() - found = False - - # Try single-file-cli first, then single-file - for binary_name in ['single-file-cli', 'single-file']: - try: - binary = Binary(name=binary_name, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='single-file') - found = True - break - except Exception: - continue - - if not found: - # Binary not found - output_binary_missing(name='single-file', binproviders='npm') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py b/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py new file mode 100755 index 00000000..f2d22b3e --- /dev/null +++ b/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +""" +Emit single-file Binary dependency for the crawl. 
+""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str, overrides: dict | None = None): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + if overrides: + record['overrides'] = overrides + print(json.dumps(record)) + + +def main(): + singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True) + + if not singlefile_enabled: + sys.exit(0) + + output_binary( + name='single-file', + binproviders='npm,env', + overrides={'npm': {'packages': ['single-file-cli']}}, + ) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile_install.js b/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js similarity index 99% rename from archivebox/plugins/singlefile/on_Crawl__04_singlefile_install.js rename to archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js index 7637bf98..c0a0b4da 100755 --- a/archivebox/plugins/singlefile/on_Crawl__04_singlefile_install.js +++ b/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js @@ -7,7 +7,7 @@ * * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle * - * Priority: 04 (early) - Must install before Chrome session starts at Crawl level + * Priority: 82 - Must install before Chrome session starts at Crawl level * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py 
b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index ec5188d8..aa73d69e 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -23,6 +23,8 @@ import json import os import subprocess import sys +import time +from urllib.request import urlopen from pathlib import Path import rich_click as click @@ -75,7 +77,22 @@ STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" staticfile_dir = Path(STATICFILE_DIR) - return staticfile_dir.exists() and any(staticfile_dir.iterdir()) + if not staticfile_dir.exists(): + return False + stdout_log = staticfile_dir / 'stdout.log' + if not stdout_log.exists(): + return False + for line in stdout_log.read_text(errors='ignore').splitlines(): + line = line.strip() + if not line.startswith('{'): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': + return True + return False # Chrome session directory (relative to extractor output dir) @@ -84,12 +101,17 @@ def has_staticfile_output() -> bool: CHROME_SESSION_DIR = '../chrome' -def get_cdp_url() -> str | None: +def get_cdp_url(wait_seconds: float = 0.0) -> str | None: """Get CDP URL from chrome plugin if available.""" cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt' - if cdp_file.exists(): - return cdp_file.read_text().strip() - return None + deadline = time.time() + max(wait_seconds, 0.0) + while True: + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + return cdp_url or None + if time.time() >= deadline: + return None + time.sleep(0.2) def get_port_from_cdp_url(cdp_url: str) -> str | None: @@ -101,6 +123,14 @@ def get_port_from_cdp_url(cdp_url: str) -> str | None: return None +def is_cdp_server_available(cdp_remote_url: str) -> bool: + try: + with 
urlopen(f'{cdp_remote_url}/json/version', timeout=1) as resp: + return resp.status == 200 + except Exception: + return False + + def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: """ Archive URL using SingleFile. @@ -122,19 +152,30 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: cmd = [binary, *singlefile_args] - # Try to use existing Chrome session via CDP - cdp_url = get_cdp_url() + # Try to use existing Chrome session via CDP (prefer HTTP base URL) + cdp_wait = min(10, max(1, timeout // 10)) + cdp_url = get_cdp_url(wait_seconds=cdp_wait) + cdp_remote_url = None if cdp_url: - # SingleFile can connect to existing browser via WebSocket - # Extract port from CDP URL (ws://127.0.0.1:PORT/...) - port = get_port_from_cdp_url(cdp_url) - if port: - cmd.extend(['--browser-server', f'http://127.0.0.1:{port}']) + if cdp_url.startswith(('http://', 'https://')): + cdp_remote_url = cdp_url + else: + port = get_port_from_cdp_url(cdp_url) + if port: + cdp_remote_url = f'http://127.0.0.1:{port}' + else: + cdp_remote_url = cdp_url + + if cdp_remote_url and not is_cdp_server_available(cdp_remote_url): + cdp_remote_url = None + + if cdp_remote_url: + cmd.extend(['--browser-server', cdp_remote_url]) elif chrome: cmd.extend(['--browser-executable-path', chrome]) - # Pass Chrome arguments (includes user-data-dir and other launch options) - if chrome_args: + # Pass Chrome arguments (only when launching a new browser) + if chrome_args and not cdp_remote_url: # SingleFile expects --browser-args as a JSON array string cmd.extend(['--browser-args', json.dumps(chrome_args)]) @@ -143,7 +184,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: cmd.append('--browser-ignore-insecure-certs') if user_agent: - cmd.extend(['--browser-user-agent', user_agent]) + cmd.extend(['--user-agent', user_agent]) if cookies_file and Path(cookies_file).is_file(): cmd.extend(['--browser-cookies-file', cookies_file]) @@ -165,11 
+206,21 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: return True, str(output_path), '' else: stderr = result.stderr.decode('utf-8', errors='replace') + stdout = result.stdout.decode('utf-8', errors='replace') if 'ERR_NAME_NOT_RESOLVED' in stderr: return False, None, 'DNS resolution failed' if 'ERR_CONNECTION_REFUSED' in stderr: return False, None, 'Connection refused' - return False, None, f'SingleFile failed: {stderr[:200]}' + detail = (stderr or stdout).strip() + if len(detail) > 2000: + detail = detail[:2000] + cmd_preview = list(cmd) + if '--browser-args' in cmd_preview: + idx = cmd_preview.index('--browser-args') + if idx + 1 < len(cmd_preview): + cmd_preview[idx + 1] = '' + cmd_str = ' '.join(cmd_preview) + return False, None, f'SingleFile failed (cmd={cmd_str}): {detail}' except subprocess.TimeoutExpired: return False, None, f'Timed out after {timeout} seconds' diff --git a/archivebox/plugins/singlefile/templates/icon.html b/archivebox/plugins/singlefile/templates/icon.html index 31f4673e..cd055f8b 100644 --- a/archivebox/plugins/singlefile/templates/icon.html +++ b/archivebox/plugins/singlefile/templates/icon.html @@ -1 +1 @@ -📦 \ No newline at end of file + diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index a473f152..c5e8d3e7 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -13,6 +13,7 @@ Tests verify: import json import os import subprocess +import sys import tempfile from pathlib import Path @@ -66,7 +67,7 @@ def test_singlefile_cli_archives_example_com(): # Run singlefile snapshot hook result = subprocess.run( - ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], + [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], cwd=tmpdir, capture_output=True, text=True, @@ -120,7 +121,7 @@ def 
test_singlefile_with_chrome_session(): # Run singlefile - it should find and use the existing Chrome session result = subprocess.run( - ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'], + [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'], cwd=str(singlefile_output_dir), capture_output=True, text=True, @@ -150,7 +151,7 @@ def test_singlefile_disabled_skips(): env['SINGLEFILE_ENABLED'] = 'False' result = subprocess.run( - ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], + [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], cwd=tmpdir, capture_output=True, text=True, diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index 5b98801b..59740e5c 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -32,6 +32,11 @@ const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.jsonl'; const CHROME_SESSION_DIR = '../chrome'; +let browser = null; +let page = null; +let sslCaptured = false; +let shuttingDown = false; + async function setupListener(url) { const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; @@ -94,6 +99,7 @@ async function setupListener(url) { // Write output directly to file fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2)); + sslCaptured = true; } catch (e) { // Ignore errors @@ -103,6 +109,29 @@ async function setupListener(url) { return { browser, page }; } +function emitResult(status = 'succeeded') { + if (shuttingDown) return; + shuttingDown = true; + + const outputStr = sslCaptured ? 
OUTPUT_FILE : OUTPUT_FILE; + console.log(JSON.stringify({ + type: 'ArchiveResult', + status, + output_str: outputStr, + })); +} + +async function handleShutdown(signal) { + console.error(`\nReceived ${signal}, emitting final results...`); + emitResult('succeeded'); + if (browser) { + try { + browser.disconnect(); + } catch (e) {} + } + process.exit(0); +} + async function main() { const args = parseArgs(); const url = args.url; @@ -119,23 +148,27 @@ async function main() { process.exit(0); } - const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; - try { // Set up listener BEFORE navigation - await setupListener(url); + const connection = await setupListener(url); + browser = connection.browser; + page = connection.page; - // Wait for chrome_navigate to complete (BLOCKING) - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4); + // Register signal handlers for graceful shutdown + process.on('SIGTERM', () => handleShutdown('SIGTERM')); + process.on('SIGINT', () => handleShutdown('SIGINT')); - // Output clean JSONL - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: OUTPUT_FILE, - })); + // Wait for chrome_navigate to complete (non-fatal) + try { + const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4); + } catch (e) { + console.error(`WARN: ${e.message}`); + } - process.exit(0); + // console.error('SSL listener active, waiting for cleanup signal...'); + await new Promise(() => {}); // Keep alive until SIGTERM + return; } catch (e) { const error = `${e.name}: ${e.message}`; diff --git a/archivebox/plugins/ssl/templates/icon.html b/archivebox/plugins/ssl/templates/icon.html index e69de29b..1707e8b9 100644 --- a/archivebox/plugins/ssl/templates/icon.html +++ b/archivebox/plugins/ssl/templates/icon.html @@ -0,0 +1 @@ + diff --git a/archivebox/plugins/ssl/tests/__init__.py b/archivebox/plugins/ssl/tests/__init__.py deleted file mode 100644 index 48a022d5..00000000 --- 
a/archivebox/plugins/ssl/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the SSL plugin.""" diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py index 6261c26b..5dfa17df 100644 --- a/archivebox/plugins/ssl/tests/test_ssl.py +++ b/archivebox/plugins/ssl/tests/test_ssl.py @@ -10,6 +10,7 @@ import shutil import subprocess import sys import tempfile +import time from pathlib import Path import pytest @@ -19,7 +20,6 @@ from django.test import TestCase sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) from chrome_test_helpers import ( chrome_session, - get_test_env, get_plugin_dir, get_hook_script, ) @@ -76,17 +76,28 @@ class TestSSLWithChrome(TestCase): # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - # Run SSL hook with the active Chrome session - result = subprocess.run( + # Run SSL hook with the active Chrome session (background hook) + result = subprocess.Popen( ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), - capture_output=True, + cwd=str(snapshot_chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=60, env=env ) + # Allow it to run briefly, then terminate (background hook) + time.sleep(3) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() + stdout, stderr = result.communicate() + else: + stdout, stderr = result.communicate() + # Check for output file ssl_output = snapshot_chrome_dir / 'ssl.jsonl' @@ -106,7 +117,7 @@ class TestSSLWithChrome(TestCase): # Try parsing from stdout if not in file if not ssl_data: - for line in result.stdout.split('\n'): + for line in stdout.split('\n'): line = line.strip() if line.startswith('{'): try: @@ -118,9 +129,8 @@ class TestSSLWithChrome(TestCase): continue # Verify hook ran successfully - 
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - self.assertNotIn('Traceback', result.stderr) - self.assertNotIn('Error:', result.stderr) + self.assertNotIn('Traceback', stderr) + self.assertNotIn('Error:', stderr) # example.com uses HTTPS, so we MUST get SSL certificate data self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL") diff --git a/archivebox/plugins/staticfile/on_Snapshot__32_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js similarity index 95% rename from archivebox/plugins/staticfile/on_Snapshot__32_staticfile.bg.js rename to archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js index 1ae44977..33531d93 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__32_staticfile.bg.js +++ b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js @@ -6,7 +6,7 @@ * Content-Type from the initial response. If it's a static file (PDF, image, etc.), * it downloads the content directly using CDP. * - * Usage: on_Snapshot__32_staticfile.bg.js --url= --snapshot-id= + * Usage: on_Snapshot__26_staticfile.bg.js --url= --snapshot-id= * Output: Downloads static file */ @@ -288,7 +288,7 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__32_staticfile.bg.js --url= --snapshot-id='); + console.error('Usage: on_Snapshot__26_staticfile.bg.js --url= --snapshot-id='); process.exit(1); } @@ -310,11 +310,15 @@ async function main() { // Set up static file listener BEFORE navigation await setupStaticFileListener(); - // Wait for chrome_navigate to complete (BLOCKING) - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + // Wait for chrome_navigate to complete (non-fatal) + try { + await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); + } catch (e) { + console.error(`WARN: ${e.message}`); + } // Keep process alive until killed by cleanup - console.error('Static file detection complete, waiting for 
cleanup signal...'); + // console.error('Static file detection complete, waiting for cleanup signal...'); // Keep the process alive indefinitely await new Promise(() => {}); // Never resolves diff --git a/archivebox/plugins/staticfile/templates/icon.html b/archivebox/plugins/staticfile/templates/icon.html index 1c681685..bc71e426 100644 --- a/archivebox/plugins/staticfile/templates/icon.html +++ b/archivebox/plugins/staticfile/templates/icon.html @@ -1 +1 @@ -📎 + diff --git a/archivebox/plugins/staticfile/tests/__init__.py b/archivebox/plugins/staticfile/tests/__init__.py deleted file mode 100644 index d60e588b..00000000 --- a/archivebox/plugins/staticfile/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for the staticfile plugin.""" diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py index 3f4412ae..b99be87c 100644 --- a/archivebox/plugins/staticfile/tests/test_staticfile.py +++ b/archivebox/plugins/staticfile/tests/test_staticfile.py @@ -10,6 +10,7 @@ import shutil import subprocess import sys import tempfile +import time from pathlib import Path import pytest @@ -76,21 +77,33 @@ class TestStaticfileWithChrome(TestCase): # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - # Run staticfile hook with the active Chrome session - result = subprocess.run( + # Run staticfile hook with the active Chrome session (background hook) + result = subprocess.Popen( ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], cwd=str(snapshot_chrome_dir), - capture_output=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, - timeout=120, # Longer timeout as it waits for navigation env=env ) + # Allow it to run briefly, then terminate (background hook) + time.sleep(3) + if result.poll() is None: + result.terminate() + try: + stdout, stderr = result.communicate(timeout=5) + except subprocess.TimeoutExpired: + result.kill() + stdout, 
stderr = result.communicate() + else: + stdout, stderr = result.communicate() + # Verify hook ran without crash - self.assertNotIn('Traceback', result.stderr) + self.assertNotIn('Traceback', stderr) # Parse JSONL output to verify it recognized HTML as non-static - for line in result.stdout.split('\n'): + for line in stdout.split('\n'): line = line.strip() if line.startswith('{'): try: diff --git a/archivebox/plugins/title/templates/icon.html b/archivebox/plugins/title/templates/icon.html index 5a051312..0cc05a17 100644 --- a/archivebox/plugins/title/templates/icon.html +++ b/archivebox/plugins/title/templates/icon.html @@ -1 +1 @@ -📝 \ No newline at end of file + diff --git a/archivebox/plugins/twocaptcha/on_Crawl__05_twocaptcha_install.js b/archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js similarity index 93% rename from archivebox/plugins/twocaptcha/on_Crawl__05_twocaptcha_install.js rename to archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js index 5b2cb4e5..23a1b3f2 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__05_twocaptcha_install.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js @@ -8,7 +8,7 @@ * Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo * Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer * - * Priority: 01 (early) - Must install before Chrome session starts at Crawl level + * Priority: 83 - Must install before Chrome session starts at Crawl level * Hook: on_Crawl (runs once per crawl, not per snapshot) * * Requirements: @@ -28,7 +28,7 @@ const EXTENSION = { /** * Main entry point - install extension before archiving * - * Note: 2captcha configuration is handled by on_Crawl__25_twocaptcha_config.js + * Note: 2captcha configuration is handled by on_Crawl__95_twocaptcha_config.js * during first-time browser setup to avoid repeated configuration on every snapshot. 
* The API key is injected via chrome.storage API once per browser session. */ diff --git a/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js similarity index 99% rename from archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js rename to archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js index 282b0404..3fe8a10a 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js @@ -5,7 +5,7 @@ * Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts. * Runs once per crawl to inject configuration into extension storage. * - * Priority: 25 (after chrome_launch at 20, before snapshots start) + * Priority: 95 (after chrome_launch at 90, before snapshots start) * Hook: on_Crawl (runs once per crawl, not per snapshot) * * Config Options (from config.json / environment): @@ -346,7 +346,7 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Crawl__25_twocaptcha_config.js --url= --snapshot-id='); + console.error('Usage: on_Crawl__95_twocaptcha_config.js --url= --snapshot-id='); process.exit(1); } diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py index d8e65a37..5738cc05 100644 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py @@ -26,8 +26,8 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__05_twocaptcha_install.js' -CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_twocaptcha_config.js' +INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__83_twocaptcha_install.js' +CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__95_twocaptcha_config.js' TEST_URL = 
'https://2captcha.com/demo/cloudflare-turnstile' diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock_install.js b/archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js similarity index 95% rename from archivebox/plugins/ublock/on_Crawl__03_ublock_install.js rename to archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js index deb1ada7..ea5fd474 100755 --- a/archivebox/plugins/ublock/on_Crawl__03_ublock_install.js +++ b/archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js @@ -7,7 +7,7 @@ * * Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm * - * Priority: 03 (early) - Must install before Chrome session starts at Crawl level + * Priority: 80 - Must install before Chrome session starts at Crawl level * Hook: on_Crawl (runs once per crawl, not per snapshot) * * This extension automatically: diff --git a/archivebox/plugins/wget/binaries.jsonl b/archivebox/plugins/wget/binaries.jsonl deleted file mode 100644 index 96965691..00000000 --- a/archivebox/plugins/wget/binaries.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"type": "Binary", "name": "wget", "binproviders": "apt,brew,pip,env"} diff --git a/archivebox/plugins/wget/on_Crawl__06_wget_install.py b/archivebox/plugins/wget/on_Crawl__06_wget_install.py deleted file mode 100755 index 3e21596f..00000000 --- a/archivebox/plugins/wget/on_Crawl__06_wget_install.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -""" -Validate and compute derived wget config values. - -This hook runs early in the Crawl lifecycle to: -1. Validate config values with warnings (not hard errors) -2. Compute derived values (USE_WGET from WGET_ENABLED) -3. 
Check binary availability and version - -Output: - - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env - - Binary JSONL records to stdout when binaries are found -""" - -import json -import os -import shutil -import subprocess -import sys - -from abx_pkg import Binary, EnvProvider - - -# Read config from environment (already validated by JSONSchema) -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - warnings = [] - errors = [] - computed = {} - - # Get config values - wget_enabled = get_env_bool('WGET_ENABLED', True) - wget_save_warc = get_env_bool('WGET_SAVE_WARC', True) - wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) - wget_binary = get_env('WGET_BINARY', 'wget') - - # Compute derived values (USE_WGET 
for backward compatibility) - use_wget = wget_enabled - computed['USE_WGET'] = str(use_wget).lower() - - # Validate timeout with warning (not error) - if use_wget and wget_timeout < 20: - warnings.append( - f"WGET_TIMEOUT={wget_timeout} is very low. " - "wget may fail to archive sites if set to less than ~20 seconds. " - "Consider setting WGET_TIMEOUT=60 or higher." - ) - - # Check binary availability using abx-pkg - provider = EnvProvider() - try: - binary = Binary(name=wget_binary, binproviders=[provider]).load() - binary_path = str(binary.abspath) if binary.abspath else '' - except Exception: - binary = None - binary_path = '' - - if not binary_path: - # Binary not found - computed['WGET_BINARY'] = '' - if use_wget: - # Emit Binary record for installation - output_binary_missing(name='wget', binproviders='apt,brew') - else: - # Binary found - computed['WGET_BINARY'] = binary_path - wget_version = str(binary.version) if binary.version else 'unknown' - computed['WGET_VERSION'] = wget_version - - # Output Binary JSONL record for installed binary - output_binary_found(binary, name='wget') - - # Check for compression support - if computed.get('WGET_BINARY'): - try: - result = subprocess.run( - [computed['WGET_BINARY'], '--compression=auto', '--help'], - capture_output=True, timeout=5 - ) - computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false' - except Exception: - computed['WGET_AUTO_COMPRESSION'] = 'false' - - # Output results - # Format: KEY=VALUE lines that hooks.py will parse and add to env - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - # Exit with error if any hard errors - sys.exit(1 if errors else 0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_install.py b/archivebox/plugins/wget/on_Crawl__10_wget_install.py 
new file mode 100755 index 00000000..16d95332 --- /dev/null +++ b/archivebox/plugins/wget/on_Crawl__10_wget_install.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +""" +Emit wget Binary dependency for the crawl. +""" + +import json +import os +import sys + + +# Read config from environment (already validated by JSONSchema) +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def output_binary(name: str, binproviders: str): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_machine_config(config: dict): + """Output Machine config JSONL patch.""" + if not config: + return + record = { + 'type': 'Machine', + 'config': config, + } + print(json.dumps(record)) + + +def main(): + warnings = [] + errors = [] + + # Get config values + wget_enabled = get_env_bool('WGET_ENABLED', True) + wget_save_warc = get_env_bool('WGET_SAVE_WARC', True) + wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) + wget_binary = get_env('WGET_BINARY', 'wget') + + # Compute derived values (USE_WGET for backward compatibility) + use_wget = wget_enabled + + # Validate timeout with warning (not error) + if use_wget and wget_timeout < 20: + warnings.append( + f"WGET_TIMEOUT={wget_timeout} is very low. " + "wget may fail to archive sites if set to less than ~20 seconds. " + "Consider setting WGET_TIMEOUT=60 or higher." 
+ ) + + if use_wget: + output_binary(name='wget', binproviders='apt,brew,pip,env') + + # Output computed config patch as JSONL + output_machine_config({ + 'USE_WGET': use_wget, + 'WGET_BINARY': wget_binary, + }) + + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + + for error in errors: + print(f"ERROR:{error}", file=sys.stderr) + + # Exit with error if any hard errors + sys.exit(1 if errors else 0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/wget/on_Snapshot__61_wget.py b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py similarity index 92% rename from archivebox/plugins/wget/on_Snapshot__61_wget.py rename to archivebox/plugins/wget/on_Snapshot__06_wget.bg.py index 8d4372d5..bf60ea58 100644 --- a/archivebox/plugins/wget/on_Snapshot__61_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py @@ -2,7 +2,7 @@ """ Archive a URL using wget. -Usage: on_Snapshot__wget.py --url= --snapshot-id= +Usage: on_Snapshot__06_wget.bg.py --url= --snapshot-id= Output: Downloads files to $PWD Environment variables: @@ -74,7 +74,22 @@ STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" staticfile_dir = Path(STATICFILE_DIR) - return staticfile_dir.exists() and any(staticfile_dir.iterdir()) + if not staticfile_dir.exists(): + return False + stdout_log = staticfile_dir / 'stdout.log' + if not stdout_log.exists(): + return False + for line in stdout_log.read_text(errors='ignore').splitlines(): + line = line.strip() + if not line.startswith('{'): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': + return True + return False diff --git a/archivebox/plugins/wget/templates/icon.html b/archivebox/plugins/wget/templates/icon.html index fdf8df21..430432cf 100644 --- a/archivebox/plugins/wget/templates/icon.html +++ 
b/archivebox/plugins/wget/templates/icon.html @@ -1 +1 @@ -📥 \ No newline at end of file + diff --git a/archivebox/plugins/wget/tests/test_wget.py b/archivebox/plugins/wget/tests/test_wget.py index 4d891904..52c1fc55 100644 --- a/archivebox/plugins/wget/tests/test_wget.py +++ b/archivebox/plugins/wget/tests/test_wget.py @@ -300,7 +300,7 @@ def test_staticfile_present_skips(): # wget/ <- wget extractor runs here, looks for ../staticfile staticfile_dir = tmpdir / 'staticfile' staticfile_dir.mkdir() - (staticfile_dir / 'index.html').write_text('test') + (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n') wget_dir = tmpdir / 'wget' wget_dir.mkdir() diff --git a/archivebox/plugins/ytdlp/binaries.jsonl b/archivebox/plugins/ytdlp/binaries.jsonl deleted file mode 100644 index 05240fd2..00000000 --- a/archivebox/plugins/ytdlp/binaries.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}} -{"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}} -{"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"} diff --git a/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py b/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py deleted file mode 100755 index 212d21bb..00000000 --- a/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -""" -Detect yt-dlp binary and emit Binary JSONL record. 
- -Output: Binary JSONL record to stdout if yt-dlp is found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary_found(binary: Binary, name: str): - """Output Binary JSONL record for an installed binary.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', # Already installed - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_binary_missing(name: str, binproviders: str): - """Output Binary JSONL record for a missing binary that needs installation.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, # Providers that can install it - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) - ytdlp_binary = get_env('YTDLP_BINARY', 'yt-dlp') - - if not ytdlp_enabled: - sys.exit(0) - - provider = EnvProvider() - try: - binary = Binary(name=ytdlp_binary, binproviders=[provider]).load() - if binary.abspath: - # Binary found - output_binary_found(binary, name='yt-dlp') - else: - # Binary not found - output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt') - except Exception: - # Binary not found - output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py 
b/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py new file mode 100755 index 00000000..7b81b5d9 --- /dev/null +++ b/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Emit yt-dlp (and related) Binary dependencies for the crawl. +""" + +import json +import os +import sys + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary(name: str, binproviders: str, overrides: dict | None = None): + """Output Binary JSONL record for a dependency.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, + 'machine_id': machine_id, + } + if overrides: + record['overrides'] = overrides + print(json.dumps(record)) + + +def main(): + ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) + + if not ytdlp_enabled: + sys.exit(0) + + output_binary( + name='yt-dlp', + binproviders='pip,brew,apt,env', + overrides={'pip': {'packages': ['yt-dlp[default]']}}, + ) + + # Node.js (required by several JS-based extractors, declared here per legacy binaries.jsonl) + output_binary( + name='node', + binproviders='apt,brew,env', + overrides={'apt': {'packages': ['nodejs']}}, + ) + + # ffmpeg (used by media extraction) + output_binary(name='ffmpeg', binproviders='apt,brew,env') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py b/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py similarity index 81% rename from archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py rename to archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py index d8faae21..633765ef 100644 --- 
a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py +++ b/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py @@ -2,7 +2,7 @@ """ Download video/audio from a URL using yt-dlp. -Usage: on_Snapshot__ytdlp.py --url= --snapshot-id= +Usage: on_Snapshot__02_ytdlp.bg.py --url= --snapshot-id= Output: Downloads video/audio files to $PWD Environment variables: @@ -21,6 +21,7 @@ import json import os import subprocess import sys +import threading from pathlib import Path import rich_click as click @@ -67,7 +68,22 @@ STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: """Check if staticfile extractor already downloaded this URL.""" staticfile_dir = Path(STATICFILE_DIR) - return staticfile_dir.exists() and any(staticfile_dir.iterdir()) + if not staticfile_dir.exists(): + return False + stdout_log = staticfile_dir / 'stdout.log' + if not stdout_log.exists(): + return False + for line in stdout_log.read_text(errors='ignore').splitlines(): + line = line.strip() + if not line.startswith('{'): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': + return True + return False def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: @@ -106,10 +122,42 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: if ytdlp_args_extra: cmd.extend(ytdlp_args_extra) + if '--newline' not in cmd: + cmd.append('--newline') + cmd.append(url) try: - result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + print(f'[ytdlp] Starting download (timeout={timeout}s)', file=sys.stderr) + + output_lines: list[str] = [] + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + def _read_output() -> None: + if not process.stdout: + return + for line in process.stdout: + output_lines.append(line) + sys.stderr.write(line) + + reader = 
threading.Thread(target=_read_output, daemon=True) + reader.start() + + try: + process.wait(timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + reader.join(timeout=1) + return False, None, f'Timed out after {timeout} seconds' + + reader.join(timeout=1) + combined_output = ''.join(output_lines) # Check if any media files were downloaded media_extensions = ( @@ -134,7 +182,7 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: output = str(video_audio[0]) if video_audio else str(downloaded_files[0]) return True, output, '' else: - stderr = result.stderr + stderr = combined_output # These are NOT errors - page simply has no downloadable media # Return success with no output (legitimate "nothing to download") @@ -142,7 +190,7 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: return True, None, '' # Not a media site - success, no output if 'URL could be a direct video link' in stderr: return True, None, '' # Not a supported media URL - success, no output - if result.returncode == 0: + if process.returncode == 0: return True, None, '' # yt-dlp exited cleanly, just no media - success # These ARE errors - something went wrong diff --git a/archivebox/plugins/ytdlp/templates/icon.html b/archivebox/plugins/ytdlp/templates/icon.html index b17d15b8..bf0e4ee4 100644 --- a/archivebox/plugins/ytdlp/templates/icon.html +++ b/archivebox/plugins/ytdlp/templates/icon.html @@ -1 +1 @@ -🎬 \ No newline at end of file + diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index ccecaef4..1de0c2db 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -439,13 +439,13 @@
{% if result.result %} {# Use plugin-specific thumbnail template when ArchiveResult is available #}
- {% extractor_thumbnail result.result %} + {% plugin_thumbnail result.result %}
{% else %} {# Fall back to generic iframe for filesystem-discovered files #} @@ -476,7 +476,7 @@ {% if best_result.result %} {# Use plugin-specific fullscreen template when ArchiveResult is available #}
- {% extractor_fullscreen best_result.result %} + {% plugin_fullscreen best_result.result %}
{% else %} {# Fall back to generic iframe #} diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css index 63bf87b2..0afdfe72 100755 --- a/archivebox/templates/static/admin.css +++ b/archivebox/templates/static/admin.css @@ -403,6 +403,38 @@ body.model-snapshot.change-list #content .object-tools { margin-top: 1px; } +.files-icons { + display: inline-flex; + flex-wrap: wrap; + gap: 4px; + vertical-align: middle; +} + +.files-icons a { + display: inline-flex; + align-items: center; + justify-content: center; + text-decoration: none; +} + +.files-icons .abx-output-icon { + width: 18px; + height: 18px; + display: inline-flex; + align-items: center; + justify-content: center; + border-radius: 4px; + color: #1f2937; + background: rgba(15, 23, 42, 0.08); + box-shadow: inset 0 0 0 1px rgba(15, 23, 42, 0.08); +} + +.files-icons .abx-output-icon svg { + width: 14px; + height: 14px; + display: block; +} + .exists-False { opacity: 0.1; filter: grayscale(100%); diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py index ff6f1875..ed2e5316 100644 --- a/archivebox/tests/conftest.py +++ b/archivebox/tests/conftest.py @@ -2,7 +2,6 @@ import os import sys -import json import subprocess from pathlib import Path from typing import List, Dict, Any, Optional, Tuple @@ -110,16 +109,9 @@ def initialized_archive(isolated_data_dir): # ============================================================================= def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]: - """Parse JSONL output into list of dicts.""" - records = [] - for line in stdout.strip().split('\n'): - line = line.strip() - if line and line.startswith('{'): - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - pass - return records + """Parse JSONL output into list of dicts via Process parser.""" + from archivebox.machine.models import Process + return Process.parse_records_from_text(stdout or '') def assert_jsonl_contains_type(stdout: str, 
record_type: str, min_count: int = 1): diff --git a/archivebox/tests/test_cli_add_interrupt.py b/archivebox/tests/test_cli_add_interrupt.py new file mode 100644 index 00000000..a9343391 --- /dev/null +++ b/archivebox/tests/test_cli_add_interrupt.py @@ -0,0 +1,133 @@ +import os +import signal +import sqlite3 +import subprocess +import sys +import time +from pathlib import Path + + +def _run(cmd, data_dir: Path, env: dict, timeout: int = 120): + return subprocess.run( + cmd, + cwd=data_dir, + env=env, + capture_output=True, + text=True, + timeout=timeout, + ) + + +def _make_env(data_dir: Path) -> dict: + env = os.environ.copy() + env["DATA_DIR"] = str(data_dir) + env["USE_COLOR"] = "False" + env["SHOW_PROGRESS"] = "False" + env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true" + env["PLUGINS"] = "title,favicon" + # Keep it fast but still real hooks + env["SAVE_TITLE"] = "True" + env["SAVE_FAVICON"] = "True" + env["SAVE_WGET"] = "False" + env["SAVE_WARC"] = "False" + env["SAVE_PDF"] = "False" + env["SAVE_SCREENSHOT"] = "False" + env["SAVE_DOM"] = "False" + env["SAVE_SINGLEFILE"] = "False" + env["SAVE_READABILITY"] = "False" + env["SAVE_MERCURY"] = "False" + env["SAVE_GIT"] = "False" + env["SAVE_YTDLP"] = "False" + env["SAVE_HEADERS"] = "False" + env["SAVE_HTMLTOTEXT"] = "False" + return env + + +def _count_running_processes(db_path: Path, where: str) -> int: + for _ in range(50): + try: + conn = sqlite3.connect(db_path, timeout=1) + cur = conn.cursor() + count = cur.execute( + f"SELECT COUNT(*) FROM machine_process WHERE status = 'running' AND {where}" + ).fetchone()[0] + conn.close() + return count + except sqlite3.OperationalError: + time.sleep(0.1) + return 0 + + +def _wait_for_count(db_path: Path, where: str, target: int, timeout: int = 20) -> bool: + start = time.time() + while time.time() - start < timeout: + if _count_running_processes(db_path, where) >= target: + return True + time.sleep(0.1) + return False + + +def 
test_add_parents_workers_to_orchestrator(tmp_path): + data_dir = tmp_path / "data" + data_dir.mkdir() + env = _make_env(data_dir) + + init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env) + assert init.returncode == 0, init.stderr + + add = _run([sys.executable, "-m", "archivebox", "add", "https://example.com"], data_dir, env, timeout=120) + assert add.returncode == 0, add.stderr + + conn = sqlite3.connect(data_dir / "index.sqlite3") + cur = conn.cursor() + orchestrator = cur.execute( + "SELECT id FROM machine_process WHERE process_type = 'orchestrator' ORDER BY created_at DESC LIMIT 1" + ).fetchone() + assert orchestrator is not None + orchestrator_id = orchestrator[0] + + worker_count = cur.execute( + "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'crawl' " + "AND parent_id = ?", + (orchestrator_id,), + ).fetchone()[0] + conn.close() + + assert worker_count >= 1, "Expected crawl worker to be parented to orchestrator" + + +def test_add_interrupt_cleans_orphaned_processes(tmp_path): + data_dir = tmp_path / "data" + data_dir.mkdir() + env = _make_env(data_dir) + + init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env) + assert init.returncode == 0, init.stderr + + proc = subprocess.Popen( + [sys.executable, "-m", "archivebox", "add", "https://example.com"], + cwd=data_dir, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + db_path = data_dir / "index.sqlite3" + saw_worker = _wait_for_count(db_path, "process_type = 'worker'", 1, timeout=20) + assert saw_worker, "Expected at least one worker to start before interrupt" + + proc.send_signal(signal.SIGINT) + proc.wait(timeout=30) + + # Wait for workers/hooks to be cleaned up + start = time.time() + while time.time() - start < 30: + running = _count_running_processes(db_path, "process_type IN ('worker','hook')") + if running == 0: + break + time.sleep(0.2) + + assert 
_count_running_processes(db_path, "process_type IN ('worker','hook')") == 0, ( + "Expected no running worker/hook processes after interrupt" + ) diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py index 54ac210a..308633ba 100755 --- a/archivebox/tests/test_hooks.py +++ b/archivebox/tests/test_hooks.py @@ -68,17 +68,8 @@ class TestJSONLParsing(unittest.TestCase): def test_parse_clean_jsonl(self): """Clean JSONL format should be parsed correctly.""" stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}' - records = [] - for line in stdout.splitlines(): - line = line.strip() - if not line or not line.startswith('{'): - continue - try: - data = json.loads(line) - if 'type' in data: - records.append(data) - except json.JSONDecodeError: - pass + from archivebox.machine.models import Process + records = Process.parse_records_from_text(stdout) self.assertEqual(len(records), 1) self.assertEqual(records[0]['type'], 'ArchiveResult') @@ -89,17 +80,8 @@ class TestJSONLParsing(unittest.TestCase): """Multiple JSONL records should all be parsed.""" stdout = '''{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"} {"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}''' - records = [] - for line in stdout.splitlines(): - line = line.strip() - if not line or not line.startswith('{'): - continue - try: - data = json.loads(line) - if 'type' in data: - records.append(data) - except json.JSONDecodeError: - pass + from archivebox.machine.models import Process + records = Process.parse_records_from_text(stdout) self.assertEqual(len(records), 2) self.assertEqual(records[0]['type'], 'ArchiveResult') @@ -111,59 +93,20 @@ class TestJSONLParsing(unittest.TestCase): Processing URL: https://example.com {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"} Hook completed successfully''' - records = [] - for line in stdout.splitlines(): - line = line.strip() - if not line or not 
line.startswith('{'): - continue - try: - data = json.loads(line) - if 'type' in data: - records.append(data) - except json.JSONDecodeError: - pass + from archivebox.machine.models import Process + records = Process.parse_records_from_text(stdout) self.assertEqual(len(records), 1) self.assertEqual(records[0]['status'], 'succeeded') - def test_parse_legacy_result_json_format(self): - """Legacy RESULT_JSON= format should be parsed for backwards compat.""" - stdout = 'RESULT_JSON={"status": "succeeded", "output": "Done"}' - output_json = None - records = [] - for line in stdout.splitlines(): - line = line.strip() - if line.startswith('RESULT_JSON='): - try: - data = json.loads(line[len('RESULT_JSON='):]) - if output_json is None: - output_json = data - data['type'] = 'ArchiveResult' - records.append(data) - except json.JSONDecodeError: - pass - - self.assertEqual(len(records), 1) - self.assertEqual(records[0]['type'], 'ArchiveResult') - self.assertEqual(records[0]['status'], 'succeeded') - def test_ignore_invalid_json(self): """Invalid JSON should be silently ignored.""" stdout = '''{"type": "ArchiveResult", "status": "succeeded"} {invalid json here} not json at all {"type": "Binary", "name": "wget"}''' - records = [] - for line in stdout.splitlines(): - line = line.strip() - if not line or not line.startswith('{'): - continue - try: - data = json.loads(line) - if 'type' in data: - records.append(data) - except json.JSONDecodeError: - pass + from archivebox.machine.models import Process + records = Process.parse_records_from_text(stdout) self.assertEqual(len(records), 2) @@ -171,17 +114,8 @@ not json at all """JSON objects without 'type' field should be ignored.""" stdout = '''{"status": "succeeded", "output_str": "Done"} {"type": "ArchiveResult", "status": "succeeded"}''' - records = [] - for line in stdout.splitlines(): - line = line.strip() - if not line or not line.startswith('{'): - continue - try: - data = json.loads(line) - if 'type' in data: - 
records.append(data) - except json.JSONDecodeError: - pass + from archivebox.machine.models import Process + records = Process.parse_records_from_text(stdout) self.assertEqual(len(records), 1) self.assertEqual(records[0]['type'], 'ArchiveResult') @@ -250,9 +184,9 @@ class TestHookDiscovery(unittest.TestCase): (wget_dir / 'on_Snapshot__50_wget.py').write_text('# test hook') (wget_dir / 'on_Crawl__00_install_wget.py').write_text('# install hook') - chrome_dir = self.plugins_dir / 'chrome_session' + chrome_dir = self.plugins_dir / 'chrome' chrome_dir.mkdir() - (chrome_dir / 'on_Snapshot__20_chrome_session.bg.js').write_text('// background hook') + (chrome_dir / 'on_Snapshot__20_chrome_tab.bg.js').write_text('// background hook') consolelog_dir = self.plugins_dir / 'consolelog' consolelog_dir.mkdir() @@ -274,7 +208,7 @@ class TestHookDiscovery(unittest.TestCase): self.assertEqual(len(hooks), 3) hook_names = [h.name for h in hooks] - self.assertIn('on_Snapshot__20_chrome_session.bg.js', hook_names) + self.assertIn('on_Snapshot__20_chrome_tab.bg.js', hook_names) self.assertIn('on_Snapshot__21_consolelog.bg.js', hook_names) self.assertIn('on_Snapshot__50_wget.py', hook_names) @@ -288,7 +222,7 @@ class TestHookDiscovery(unittest.TestCase): hooks = sorted(set(hooks), key=lambda p: p.name) # Check numeric ordering - self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_session.js') + self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.bg.js') self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.bg.js') self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py') @@ -348,9 +282,11 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": ) self.assertEqual(result.returncode, 0) - output = json.loads(result.stdout.strip()) - self.assertEqual(output['type'], 'ArchiveResult') - self.assertEqual(output['status'], 'succeeded') + from archivebox.machine.models import Process + records = Process.parse_records_from_text(result.stdout) + 
self.assertTrue(records) + self.assertEqual(records[0]['type'], 'ArchiveResult') + self.assertEqual(records[0]['status'], 'succeeded') def test_js_hook_execution(self): """JavaScript hook should execute and output JSONL.""" @@ -371,9 +307,11 @@ console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_s ) self.assertEqual(result.returncode, 0) - output = json.loads(result.stdout.strip()) - self.assertEqual(output['type'], 'ArchiveResult') - self.assertEqual(output['status'], 'succeeded') + from archivebox.machine.models import Process + records = Process.parse_records_from_text(result.stdout) + self.assertTrue(records) + self.assertEqual(records[0]['type'], 'ArchiveResult') + self.assertEqual(records[0]['status'], 'succeeded') def test_hook_receives_cli_args(self): """Hook should receive CLI arguments.""" @@ -398,8 +336,10 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge ) self.assertEqual(result.returncode, 0) - output = json.loads(result.stdout.strip()) - self.assertEqual(output['url'], 'https://example.com') + from archivebox.machine.models import Process + records = Process.parse_records_from_text(result.stdout) + self.assertTrue(records) + self.assertEqual(records[0]['url'], 'https://example.com') class TestInstallHookOutput(unittest.TestCase): @@ -424,7 +364,8 @@ class TestInstallHookOutput(unittest.TestCase): 'binprovider': 'apt', }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['type'], 'Binary') self.assertEqual(data['name'], 'wget') self.assertTrue(data['abspath'].startswith('/')) @@ -433,15 +374,16 @@ class TestInstallHookOutput(unittest.TestCase): """Install hook should output Machine config update JSONL.""" hook_output = json.dumps({ 'type': 'Machine', - '_method': 'update', - 'key': 'config/WGET_BINARY', - 'value': '/usr/bin/wget', + 'config': { + 'WGET_BINARY': '/usr/bin/wget', + }, 
}) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['type'], 'Machine') - self.assertEqual(data['_method'], 'update') - self.assertEqual(data['key'], 'config/WGET_BINARY') + self.assertIn('config', data) + self.assertEqual(data['config']['WGET_BINARY'], '/usr/bin/wget') class TestSnapshotHookOutput(unittest.TestCase): @@ -455,7 +397,8 @@ class TestSnapshotHookOutput(unittest.TestCase): 'output_str': 'Downloaded 5 files', }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['type'], 'ArchiveResult') self.assertEqual(data['status'], 'succeeded') self.assertIn('output_str', data) @@ -469,7 +412,8 @@ class TestSnapshotHookOutput(unittest.TestCase): 'cmd': ['/usr/bin/wget', '-p', '-k', 'https://example.com'], }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['type'], 'ArchiveResult') self.assertIsInstance(data['cmd'], list) self.assertEqual(data['cmd'][0], '/usr/bin/wget') @@ -487,7 +431,8 @@ class TestSnapshotHookOutput(unittest.TestCase): }, }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['type'], 'ArchiveResult') self.assertIsInstance(data['output_json'], dict) self.assertEqual(data['output_json']['status-code'], 200) @@ -500,7 +445,8 @@ class TestSnapshotHookOutput(unittest.TestCase): 'output_str': 'SAVE_WGET=False', }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['status'], 'skipped') def test_snapshot_hook_failed_status(self): @@ -511,7 +457,8 @@ class TestSnapshotHookOutput(unittest.TestCase): 'output_str': 
'404 Not Found', }) - data = json.loads(hook_output) + from archivebox.machine.models import Process + data = Process.parse_records_from_text(hook_output)[0] self.assertEqual(data['status'], 'failed') diff --git a/archivebox/tests/test_list.py b/archivebox/tests/test_list.py index b46596fa..d527fa5d 100644 --- a/archivebox/tests/test_list.py +++ b/archivebox/tests/test_list.py @@ -18,11 +18,10 @@ def test_search_json(process, disable_extractors_dict): clean_str = re.sub(r'\x1b\[[0-9;]*m', '', output_str) clean_str = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', clean_str) output_json = json.loads(clean_str) - # With --index-only, only source file snapshots are created (file:// URLs) # Verify we get at least one snapshot back assert len(output_json) >= 1 - # The snapshot should be a file:// URL pointing to sources - assert any("sources" in entry.get("url", "") for entry in output_json) + # Should include the requested URL + assert any("example.com" in entry.get("url", "") for entry in output_json) def test_search_json_headers(process, disable_extractors_dict): @@ -65,16 +64,17 @@ def test_search_csv(process, disable_extractors_dict): capture_output=True, env=disable_extractors_dict) search_process = subprocess.run(["archivebox", "search", "--csv", "url"], capture_output=True) output_csv = search_process.stdout.decode("utf-8") - # Should contain the source file URL - assert "file://" in output_csv or "sources" in output_csv + # Should contain the requested URL + assert "example.com" in output_csv def test_search_csv_headers(process, disable_extractors_dict): subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"], capture_output=True, env=disable_extractors_dict) search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--with-headers"], capture_output=True) output_csv = search_process.stdout.decode("utf-8") - # Should have url header and source file content + # Should have url header and 
requested URL assert "url" in output_csv + assert "example.com" in output_csv def test_search_with_headers_requires_format(process): search_process = subprocess.run(["archivebox", "search", "--with-headers"], capture_output=True) diff --git a/archivebox/tests/test_real_world_add.py b/archivebox/tests/test_real_world_add.py new file mode 100644 index 00000000..3c72e622 --- /dev/null +++ b/archivebox/tests/test_real_world_add.py @@ -0,0 +1,133 @@ +import os +import sqlite3 +import subprocess +from pathlib import Path + + +def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: + candidates = {snapshot_id} + if len(snapshot_id) == 32: + hyphenated = f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}" + candidates.add(hyphenated) + elif len(snapshot_id) == 36 and '-' in snapshot_id: + candidates.add(snapshot_id.replace('-', '')) + + for needle in candidates: + for path in data_dir.rglob(needle): + if path.is_dir(): + return path + return None + + +def _find_html_with_text(root: Path, needle: str) -> list[Path]: + hits: list[Path] = [] + for path in root.rglob("*.htm*"): + if not path.is_file(): + continue + try: + if needle in path.read_text(errors="ignore"): + hits.append(path) + except Exception: + continue + return hits + + +def test_add_real_world_example_domain(tmp_path): + os.chdir(tmp_path) + tmp_short = Path("/tmp") / f"abx-{tmp_path.name}" + tmp_short.mkdir(parents=True, exist_ok=True) + env = os.environ.copy() + env["TMP_DIR"] = str(tmp_short) + env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true" + + init = subprocess.run( + ["archivebox", "init"], + capture_output=True, + text=True, + timeout=120, + env=env, + ) + assert init.returncode == 0, f"archivebox init failed: {init.stderr}" + + result = subprocess.run( + ["archivebox", "add", "https://example.com"], + capture_output=True, + text=True, + timeout=900, + env=env, + ) + assert result.returncode == 0, ( + "archivebox add failed.\n" + 
f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + conn = sqlite3.connect(tmp_path / "index.sqlite3") + c = conn.cursor() + snapshot_row = c.execute( + "SELECT id, url, title FROM core_snapshot WHERE url = ?", + ("https://example.com",), + ).fetchone() + assert snapshot_row is not None, "Snapshot for https://example.com not found in DB" + snapshot_id, snapshot_url, snapshot_title = snapshot_row + assert snapshot_title and "Example Domain" in snapshot_title, ( + f"Expected title to contain Example Domain, got: {snapshot_title}" + ) + + failed_results = c.execute( + "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ? AND status = 'failed'", + (snapshot_id,), + ).fetchone()[0] + assert failed_results == 0, "Some archive results failed for example.com snapshot" + + binary_workers = c.execute( + "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary'" + ).fetchone()[0] + assert binary_workers > 0, "Expected BinaryWorker to run installs via BinaryMachine" + + failed_binary_workers = c.execute( + "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary' " + "AND exit_code IS NOT NULL AND exit_code != 0" + ).fetchone()[0] + assert failed_binary_workers == 0, "BinaryWorker reported non-zero exit codes" + + queued_binaries = c.execute( + "SELECT name FROM machine_binary WHERE status != 'installed'" + ).fetchall() + assert not queued_binaries, f"Some binaries did not install: {queued_binaries}" + conn.close() + + snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id)) + assert snapshot_dir is not None, "Snapshot output directory not found" + + title_path = snapshot_dir / "title" / "title.txt" + assert title_path.exists(), f"Missing title output: {title_path}" + assert "Example Domain" in title_path.read_text(errors="ignore") + + html_sources = [] + for candidate in ("wget", "singlefile", "dom"): + for candidate_dir in (snapshot_dir / candidate, 
*snapshot_dir.glob(f"*_{candidate}")): + if candidate_dir.exists(): + html_sources.extend(_find_html_with_text(candidate_dir, "Example Domain")) + assert len(html_sources) >= 2, ( + "Expected HTML outputs from multiple extractors to contain Example Domain " + f"(found {len(html_sources)})." + ) + + text_hits = 0 + for path in ( + *snapshot_dir.glob("*_readability/content.txt"), + snapshot_dir / "readability" / "content.txt", + ): + if path.exists() and "Example Domain" in path.read_text(errors="ignore"): + text_hits += 1 + for path in ( + *snapshot_dir.glob("*_htmltotext/htmltotext.txt"), + snapshot_dir / "htmltotext" / "htmltotext.txt", + ): + if path.exists() and "Example Domain" in path.read_text(errors="ignore"): + text_hits += 1 + assert text_hits >= 2, ( + "Expected multiple text extractors to contain Example Domain " + f"(readability/htmltotext hits={text_hits})." + ) diff --git a/archivebox/tests/test_settings_signal_webhooks.py b/archivebox/tests/test_settings_signal_webhooks.py new file mode 100644 index 00000000..acb6367d --- /dev/null +++ b/archivebox/tests/test_settings_signal_webhooks.py @@ -0,0 +1,8 @@ +from django.test import TestCase + + +class TestSignalWebhooksSettings(TestCase): + def test_task_handler_is_sync_in_tests(self): + from signal_webhooks.settings import webhook_settings + + assert webhook_settings.TASK_HANDLER.__name__ == "sync_task_handler" diff --git a/archivebox/tests/test_snapshot.py b/archivebox/tests/test_snapshot.py index 7ca8e5c8..8d2fc3fc 100644 --- a/archivebox/tests/test_snapshot.py +++ b/archivebox/tests/test_snapshot.py @@ -4,7 +4,11 @@ import os import subprocess import sqlite3 -import json +from archivebox.machine.models import Process +from datetime import datetime +from pathlib import Path +from urllib.parse import urlparse +import uuid import pytest @@ -16,19 +20,51 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e os.chdir(tmp_path) subprocess.run( - ['archivebox', 'snapshot', 
'https://example.com'], + ['archivebox', 'snapshot', 'create', 'https://example.com'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') c = conn.cursor() - result = c.execute("SELECT url FROM core_snapshot WHERE url = ?", - ('https://example.com',)).fetchone() + snapshot_row = c.execute( + "SELECT id, created_at, url, crawl_id FROM core_snapshot WHERE url = ?", + ('https://example.com',) + ).fetchone() + assert snapshot_row is not None + crawl_row = c.execute( + "SELECT id, created_at, urls, created_by_id FROM crawls_crawl WHERE id = ?", + (snapshot_row[3],) + ).fetchone() + assert crawl_row is not None + user_row = c.execute( + "SELECT username FROM auth_user WHERE id = ?", + (crawl_row[3],) + ).fetchone() + assert user_row is not None conn.close() - assert result is not None - assert result[0] == 'https://example.com' + snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row + snapshot_id = str(uuid.UUID(snapshot_id_raw)) + crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row + username = user_row[0] + crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d') + snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d') + domain = urlparse(snapshot_url).hostname or 'unknown' + + # Verify crawl symlink exists and is relative + target_path = tmp_path / 'users' / username / 'snapshots' / snapshot_date_str / domain / snapshot_id + symlinks = [ + p for p in tmp_path.rglob(str(snapshot_id)) + if p.is_symlink() + ] + assert symlinks, "Snapshot symlink should exist under crawl dir" + link_path = symlinks[0] + + assert link_path.is_symlink(), "Snapshot symlink should exist under crawl dir" + link_target = os.readlink(link_path) + assert not os.path.isabs(link_target), "Symlink should be relative" + assert link_path.resolve() == target_path.resolve() def 
test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disable_extractors_dict): @@ -36,11 +72,11 @@ def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disa os.chdir(tmp_path) subprocess.run( - ['archivebox', 'snapshot', + ['archivebox', 'snapshot', 'create', 'https://example.com', 'https://iana.org'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') @@ -59,10 +95,10 @@ def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disab os.chdir(tmp_path) subprocess.run( - ['archivebox', 'snapshot', '--tag=mytesttag', + ['archivebox', 'snapshot', 'create', '--tag=mytesttag', 'https://example.com'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') @@ -95,22 +131,15 @@ def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_ # Pass URL as argument instead of stdin for more reliable behavior result = subprocess.run( - ['archivebox', 'snapshot', 'https://example.com'], + ['archivebox', 'snapshot', 'create', 'https://example.com'], capture_output=True, text=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) # Parse JSONL output lines - snapshot_records = [] - for line in result.stdout.strip().split('\n'): - if line: - try: - record = json.loads(line) - if record.get('type') == 'Snapshot': - snapshot_records.append(record) - except json.JSONDecodeError: - continue + records = Process.parse_records_from_text(result.stdout) + snapshot_records = [r for r in records if r.get('type') == 'Snapshot'] assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record" @@ -127,10 +156,10 @@ def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors # Use command line args instead of stdin subprocess.run( 
- ['archivebox', 'snapshot', '--tag=customtag', 'https://example.com'], + ['archivebox', 'snapshot', 'create', '--tag=customtag', 'https://example.com'], capture_output=True, text=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') @@ -145,40 +174,40 @@ def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors assert tag[0] == 'customtag' -def test_snapshot_with_depth_creates_crawl_object(tmp_path, process, disable_extractors_dict): - """Test that --depth > 0 creates a Crawl object with correct max_depth.""" +def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extractors_dict): + """Test that --depth sets snapshot depth when creating snapshots.""" os.chdir(tmp_path) subprocess.run( - ['archivebox', 'snapshot', '--depth=1', + ['archivebox', 'snapshot', 'create', '--depth=1', 'https://example.com'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') c = conn.cursor() - crawl = c.execute("SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone() + snapshot = c.execute("SELECT depth FROM core_snapshot ORDER BY created_at DESC LIMIT 1").fetchone() conn.close() - assert crawl is not None, "Crawl object should be created when depth > 0" - assert crawl[0] == 1, "Crawl max_depth should match --depth value" + assert snapshot is not None, "Snapshot should be created when depth is provided" + assert snapshot[0] == 1, "Snapshot depth should match --depth value" -def test_snapshot_deduplicates_urls(tmp_path, process, disable_extractors_dict): - """Test that adding the same URL twice doesn't create duplicate snapshots.""" +def test_snapshot_allows_duplicate_urls_across_crawls(tmp_path, process, disable_extractors_dict): + """Snapshot create auto-creates a crawl per run; same URL can appear multiple times.""" 
os.chdir(tmp_path) # Add same URL twice subprocess.run( - ['archivebox', 'snapshot', 'https://example.com'], + ['archivebox', 'snapshot', 'create', 'https://example.com'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) subprocess.run( - ['archivebox', 'snapshot', 'https://example.com'], + ['archivebox', 'snapshot', 'create', 'https://example.com'], capture_output=True, - env=disable_extractors_dict, + env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)}, ) conn = sqlite3.connect('index.sqlite3') @@ -187,7 +216,7 @@ def test_snapshot_deduplicates_urls(tmp_path, process, disable_extractors_dict): ('https://example.com',)).fetchone()[0] conn.close() - assert count == 1, "Same URL should not create duplicate snapshots" + assert count == 2, "Same URL should create separate snapshots across different crawls" if __name__ == '__main__': diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 4b8a2827..358c6ad9 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -83,6 +83,10 @@ class Orchestrator: # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker if self.exit_on_idle: self.MAX_CRAWL_WORKERS = 1 + # Faster UI updates for interactive runs + self.POLL_INTERVAL = 0.25 + # Exit quickly once idle in foreground mode + self.IDLE_TIMEOUT = 1 def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' @@ -111,8 +115,14 @@ class Orchestrator: # Clean up any stale Process records from previous runs stale_count = Process.cleanup_stale_running() - # Clean up orphaned Chrome processes from previous crashes - chrome_count = Process.cleanup_orphaned_chrome() + # Foreground runs should start fast; skip expensive orphan cleanup unless in daemon mode. 
+ chrome_count = 0 + orphaned_workers = 0 + if not self.exit_on_idle: + # Clean up orphaned Chrome processes from previous crashes + chrome_count = Process.cleanup_orphaned_chrome() + # Clean up orphaned workers from previous crashes + orphaned_workers = Process.cleanup_orphaned_workers() # Collect startup metadata metadata = { @@ -123,6 +133,8 @@ class Orchestrator: metadata['cleaned_stale_pids'] = stale_count if chrome_count: metadata['cleaned_orphaned_chrome'] = chrome_count + if orphaned_workers: + metadata['cleaned_orphaned_workers'] = orphaned_workers log_worker_event( worker_type='Orchestrator', @@ -135,30 +147,26 @@ class Orchestrator: def terminate_all_workers(self) -> None: """Terminate all running worker processes.""" from archivebox.machine.models import Process - import signal - - # Get all running worker processes - running_workers = Process.objects.filter( - process_type=Process.TypeChoices.WORKER, - status__in=['running', 'started'] - ) + # Get running worker processes scoped to this orchestrator when possible + if getattr(self, 'db_process', None): + running_workers = self._get_scoped_running_workers() + else: + running_workers = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + status=Process.StatusChoices.RUNNING, + ) for worker_process in running_workers: try: - # Send SIGTERM to gracefully terminate the worker - os.kill(worker_process.pid, signal.SIGTERM) - except ProcessLookupError: - # Process already dead - pass + # Gracefully terminate the worker and update Process status + worker_process.terminate(graceful_timeout=5.0) except Exception: - # Ignore other errors during shutdown pass def on_shutdown(self, error: BaseException | None = None) -> None: """Called when orchestrator shuts down.""" - # Terminate all worker processes in exit_on_idle mode - if self.exit_on_idle: - self.terminate_all_workers() + # Terminate all worker processes on shutdown + self.terminate_all_workers() # Update Process record status if hasattr(self, 
'db_process') and self.db_process: @@ -188,11 +196,26 @@ class Orchestrator: Process.cleanup_stale_running() self._last_cleanup_time = now + if self.crawl_id and getattr(self, 'db_process', None): + return self._get_scoped_running_workers().count() + return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES) def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int: """Get count of running workers for a specific worker type.""" + if self.crawl_id and getattr(self, 'db_process', None): + return self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count() return len(WorkerClass.get_running_workers()) + + def _get_scoped_running_workers(self): + """Get running workers scoped to this orchestrator process tree.""" + from archivebox.machine.models import Process + + descendants = self.db_process.get_descendants(include_self=False) + return descendants.filter( + process_type=Process.TypeChoices.WORKER, + status=Process.StatusChoices.RUNNING, + ) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: """Determine if we should spawn a new worker.""" @@ -208,8 +231,11 @@ class Orchestrator: max_workers = 1 # Default for unknown types # Check worker limit - running_workers = WorkerClass.get_running_workers() - running_count = len(running_workers) + if self.crawl_id and getattr(self, 'db_process', None) and WorkerClass.name != 'binary': + running_count = self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count() + else: + running_workers = WorkerClass.get_running_workers() + running_count = len(running_workers) if running_count >= max_workers: return False @@ -225,9 +251,13 @@ class Orchestrator: """Spawn a new worker process. 
Returns PID or None if spawn failed.""" try: print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]') - pid = WorkerClass.start(crawl_id=self.crawl_id) + pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id) print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]') + if self.exit_on_idle: + # Foreground runs have MAX_CRAWL_WORKERS=1; avoid blocking startup on registration. + return pid + # CRITICAL: Block until worker registers itself in Process table # This prevents race condition where orchestrator spawns multiple workers # before any of them finish on_startup() and register @@ -316,7 +346,7 @@ class Orchestrator: if binary_count > 0: running_binary_workers_list = BinaryWorker.get_running_workers() if len(running_binary_workers_list) == 0: - BinaryWorker.start() + BinaryWorker.start(parent=self.db_process) # Check if any BinaryWorkers are still running running_binary_workers = len(BinaryWorker.get_running_workers()) @@ -344,7 +374,7 @@ class Orchestrator: # Claim next crawl crawl = crawl_queue.first() if crawl and self._claim_crawl(crawl): - CrawlWorker.start(crawl_id=str(crawl.id)) + CrawlWorker.start(parent=self.db_process, crawl_id=str(crawl.id)) return queue_sizes @@ -463,7 +493,7 @@ class Orchestrator: with Live( progress_layout.get_layout(), - refresh_per_second=4, + refresh_per_second=8, screen=True, console=orchestrator_console, ): @@ -521,41 +551,147 @@ class Orchestrator: else: status = "Idle" + binary_workers_count = worker_counts.get('binary', 0) # Update orchestrator status progress_layout.update_orchestrator_status( status=status, crawl_queue_count=crawl_queue_count, crawl_workers_count=crawl_workers_count, + binary_queue_count=queue_sizes.get('binary', 0), + binary_workers_count=binary_workers_count, max_crawl_workers=self.MAX_CRAWL_WORKERS, ) - # Update CrawlWorker logs by tailing Process stdout/stderr - if crawl_workers_count > 0: - from 
archivebox.machine.models import Process - crawl_worker_process = Process.objects.filter( - process_type=Process.TypeChoices.WORKER, - worker_type='crawl', - status__in=['running', 'started'] - ).first() - if crawl_worker_process: - progress_layout.update_crawl_worker_logs(crawl_worker_process) + # Update crawl queue tree (active + recently completed) + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot, ArchiveResult + recent_cutoff = timezone.now() - timedelta(minutes=5) + pending_snapshot_candidates: list[Snapshot] = [] + hooks_by_snapshot: dict[str, list] = {} - # Log queue size changes - if queue_sizes != last_queue_sizes: - for worker_type, count in queue_sizes.items(): - old_count = last_queue_sizes.get(worker_type, 0) - if count != old_count: - if count > old_count: - progress_layout.log_event( - f"{worker_type.capitalize()} queue: {old_count} → {count}", - style="yellow" - ) - else: - progress_layout.log_event( - f"{worker_type.capitalize()} queue: {old_count} → {count}", - style="green" - ) - last_queue_sizes = queue_sizes.copy() + active_qs = Crawl.objects.exclude(status__in=Crawl.FINAL_STATES) + if self.crawl_id: + active_qs = active_qs.filter(id=self.crawl_id) + active_qs = active_qs.order_by('retry_at') + + recent_done_qs = Crawl.objects.filter( + status__in=Crawl.FINAL_STATES, + modified_at__gte=recent_cutoff, + ) + if self.crawl_id: + recent_done_qs = recent_done_qs.filter(id=self.crawl_id) + recent_done_qs = recent_done_qs.order_by('-modified_at') + + crawls = list(active_qs) + active_ids = {c.id for c in crawls} + for crawl in recent_done_qs: + if crawl.id not in active_ids: + crawls.append(crawl) + + def _abbrev(text: str, max_len: int = 80) -> str: + return text if len(text) <= max_len else f"{text[:max_len - 3]}..." 
+ + tree_data: list[dict] = [] + for crawl in crawls: + urls = crawl.get_urls_list() + url_count = len(urls) + label = f"{url_count} url" + ("s" if url_count != 1 else "") + label = _abbrev(label) + + snapshots = [] + snap_qs = Snapshot.objects.filter(crawl_id=crawl.id) + active_snaps = list( + snap_qs.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]) + .order_by('created_at')[:16] + ) + recent_snaps = list( + snap_qs.filter(status__in=Snapshot.FINAL_STATES) + .order_by('-modified_at')[:8] + ) + snap_ids = {s.id for s in active_snaps} + for s in recent_snaps: + if s.id not in snap_ids: + active_snaps.append(s) + + for snap in active_snaps: + total = snap.archiveresult_set.count() + completed = snap.archiveresult_set.filter(status__in=[ + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.FAILED, + ]).count() + running = snap.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED).count() + try: + from archivebox.config.configset import get_config + from archivebox.hooks import discover_hooks + hooks_list = discover_hooks('Snapshot', config=get_config(snapshot=snap)) + total_hooks = len(hooks_list) + hooks_by_snapshot[str(snap.id)] = hooks_list + except Exception: + total_hooks = total + pending = max(total_hooks - completed - running, 0) + snap_label = _abbrev(snap.url or str(snap.id), max_len=60) + snapshots.append({ + 'id': str(snap.id), + 'status': snap.status, + 'label': snap_label, + 'hooks': {'completed': completed, 'running': running, 'pending': pending} if total else {}, + }) + pending_snapshot_candidates.append(snap) + + tree_data.append({ + 'id': str(crawl.id), + 'status': crawl.status, + 'label': label, + 'snapshots': snapshots, + }) + + progress_layout.update_crawl_tree(tree_data) + + # Update running process panels (tail stdout/stderr for each running process) + from archivebox.machine.models import Process + if self.crawl_id and getattr(self, 
'db_process', None): + process_qs = self.db_process.get_descendants(include_self=False) + process_qs = process_qs.filter(status=Process.StatusChoices.RUNNING) + else: + process_qs = Process.objects.filter( + status=Process.StatusChoices.RUNNING, + ).exclude(process_type=Process.TypeChoices.ORCHESTRATOR) + + running_processes = [ + proc for proc in process_qs.order_by('process_type', 'worker_type', 'started_at') + if proc.is_running + ] + pending_processes = [] + try: + from types import SimpleNamespace + for snap in pending_snapshot_candidates: + hooks_list = hooks_by_snapshot.get(str(snap.id), []) + if not hooks_list: + continue + existing = set( + snap.archiveresult_set.exclude(hook_name='').values_list('hook_name', flat=True) + ) + for hook_path in hooks_list: + if hook_path.name in existing: + continue + pending_processes.append(SimpleNamespace( + process_type='hook', + worker_type='', + pid=None, + cmd=['', str(hook_path)], + url=snap.url, + status='queued', + started_at=None, + timeout=None, + pwd=None, + )) + except Exception: + pending_processes = [] + + progress_layout.update_process_panels(running_processes, pending=pending_processes) + + last_queue_sizes = queue_sizes.copy() # Update snapshot progress from archivebox.core.models import Snapshot @@ -641,11 +777,10 @@ class Orchestrator: # Hooks created but none started yet current_plugin = "waiting" - # Update snapshot worker (show even if no hooks yet) # Debug: Log first time we see this snapshot - if snapshot.id not in progress_layout.snapshot_to_worker: + if snapshot.id not in snapshot_progress: progress_layout.log_event( - f"Assigning to worker: {snapshot.url[:50]}", + f"Tracking snapshot: {snapshot.url[:50]}", style="grey53" ) @@ -656,17 +791,21 @@ class Orchestrator: if prev_progress != curr_progress: prev_total, prev_completed, prev_plugin = prev_progress - # Log hooks created - if total > prev_total: - progress_layout.log_event( - f"Hooks created: {total} for {snapshot.url[:40]}", - style="cyan" - 
) - # Log hook completion if completed > prev_completed: + completed_ar = snapshot.archiveresult_set.filter( + status__in=['succeeded', 'skipped', 'failed'] + ).order_by('-end_ts', '-modified_at').first() + hook_label = '' + if completed_ar: + hook_name = completed_ar.hook_name or completed_ar.plugin or '' + if hook_name: + hook_label = hook_name.split('__')[-1] if '__' in hook_name else hook_name + hook_label = hook_label.replace('.py', '').replace('.js', '').replace('.sh', '').replace('.bg', '') + if not hook_label: + hook_label = f"{completed}/{total}" progress_layout.log_event( - f"Hook completed: {completed}/{total} for {snapshot.url[:40]}", + f"Hook completed: {hook_label}", style="green" ) @@ -686,23 +825,15 @@ class Orchestrator: style="red" ) - progress_layout.update_snapshot_worker( - snapshot_id=snapshot.id, - url=snapshot.url, - total=max(total, 1), # Show at least 1 to avoid division by zero - completed=completed, - current_plugin=current_plugin, - ) + # No per-snapshot panels; logs only - # Remove snapshots that are no longer active - for snapshot_id in list(progress_layout.snapshot_to_worker.keys()): + # Cleanup progress tracking for completed snapshots + for snapshot_id in list(snapshot_progress.keys()): if snapshot_id not in active_ids: progress_layout.log_event( f"Snapshot completed/removed", style="blue" ) - progress_layout.remove_snapshot_worker(snapshot_id) - # Also clean up progress tracking if snapshot_id in snapshot_progress: del snapshot_progress[snapshot_id] @@ -734,6 +865,7 @@ class Orchestrator: if progress_layout: progress_layout.log_event("Interrupted by user", style="red") print() # Newline after ^C + self.on_shutdown(error=KeyboardInterrupt()) except BaseException as e: if progress_layout: progress_layout.log_event(f"Error: {e}", style="red") diff --git a/archivebox/workers/tests/test_orchestrator.py b/archivebox/workers/tests/test_orchestrator.py index d54331ec..79d37f95 100644 --- a/archivebox/workers/tests/test_orchestrator.py +++ 
b/archivebox/workers/tests/test_orchestrator.py @@ -215,6 +215,46 @@ class TestOrchestratorWithProcess(TestCase): mock_count.assert_called() self.assertTrue(result) + def test_orchestrator_scoped_worker_count(self): + """Orchestrator with crawl_id should count only descendant workers.""" + import time + from archivebox.machine.models import Process, Machine + + machine = Machine.current() + orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl') + + orchestrator.db_process = Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + pid=12345, + started_at=timezone.now(), + ) + + # Prevent cleanup from marking fake PIDs as exited + orchestrator._last_cleanup_time = time.time() + + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.WORKER, + worker_type='crawl', + status=Process.StatusChoices.RUNNING, + pid=12346, + parent=orchestrator.db_process, + started_at=timezone.now(), + ) + + Process.objects.create( + machine=machine, + process_type=Process.TypeChoices.WORKER, + worker_type='crawl', + status=Process.StatusChoices.RUNNING, + pid=12347, + started_at=timezone.now(), + ) + + self.assertEqual(orchestrator.get_total_worker_count(), 1) + class TestProcessBasedWorkerTracking(TestCase): """Test Process model methods that replace pid_utils functionality.""" diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 7546a02a..38f5361b 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -23,6 +23,7 @@ from django.db.models import QuerySet from django.utils import timezone from django.conf import settings +from statemachine.exceptions import TransitionNotAllowed from rich import print from archivebox.misc.logging_util import log_worker_event @@ -450,13 +451,34 @@ class CrawlWorker(Worker): def runloop(self) -> None: """Run crawl state machine, spawn SnapshotWorkers.""" import sys + from 
archivebox.crawls.models import Crawl self.on_startup() try: print(f'🔄 CrawlWorker starting for crawl {self.crawl_id}', file=sys.stderr) + if self.crawl.status == Crawl.StatusChoices.SEALED: + print( + '✅ This crawl has already completed and there are no tasks remaining.\n' + ' To re-crawl it, create a new crawl with the same URLs, e.g.\n' + ' archivebox crawl create | archivebox run', + file=sys.stderr, + ) + return + # Advance state machine: QUEUED → STARTED (triggers run() via @started.enter) - self.crawl.sm.tick() + try: + self.crawl.sm.tick() + except TransitionNotAllowed: + if self.crawl.status == Crawl.StatusChoices.SEALED: + print( + '✅ This crawl has already completed and there are no tasks remaining.\n' + ' To re-crawl it, create a new crawl with the same URLs, e.g.\n' + ' archivebox crawl create | archivebox run', + file=sys.stderr, + ) + return + raise self.crawl.refresh_from_db() print(f'🔄 tick() complete, crawl status={self.crawl.status}', file=sys.stderr) @@ -509,13 +531,20 @@ class CrawlWorker(Worker): status__in=['running', 'started'], ) - # Extract snapshot IDs from their pwd (contains snapshot ID at the end) + # Extract snapshot IDs from worker cmd args (more reliable than pwd paths) running_snapshot_ids = [] for proc in running_processes: - if proc.pwd: - # pwd is like: /path/to/archive/{timestamp} - # We need to match this against snapshot.output_dir - running_snapshot_ids.append(proc.pwd) + cmd = proc.cmd or [] + snapshot_id = None + for i, part in enumerate(cmd): + if part == '--snapshot-id' and i + 1 < len(cmd): + snapshot_id = cmd[i + 1] + break + if part.startswith('--snapshot-id='): + snapshot_id = part.split('=', 1)[1] + break + if snapshot_id: + running_snapshot_ids.append(snapshot_id) # Find snapshots that don't have a running worker all_snapshots = Snapshot.objects.filter( @@ -526,7 +555,7 @@ class CrawlWorker(Worker): # Filter out snapshots that already have workers pending_snapshots = [ snap for snap in all_snapshots - if 
snap.output_dir not in running_snapshot_ids + if str(snap.id) not in running_snapshot_ids ][:self.MAX_SNAPSHOT_WORKERS - running_count] with open(debug_log, 'a') as f: @@ -631,7 +660,6 @@ class SnapshotWorker(Worker): b. If foreground: wait for completion c. If background: track but continue to next hook d. Update ArchiveResult status - e. Advance current_step when all step's hooks complete 4. When all hooks done: seal snapshot 5. On shutdown: SIGTERM all background hooks """ @@ -662,7 +690,7 @@ class SnapshotWorker(Worker): def runloop(self) -> None: """Execute all hooks sequentially.""" - from archivebox.hooks import discover_hooks, is_background_hook, extract_step + from archivebox.hooks import discover_hooks, is_background_hook from archivebox.core.models import ArchiveResult from archivebox.config.configset import get_config @@ -679,8 +707,7 @@ class SnapshotWorker(Worker): # Execute each hook sequentially for hook_path in hooks: hook_name = hook_path.name - plugin = self._extract_plugin_name(hook_name) - hook_step = extract_step(hook_name) + plugin = self._extract_plugin_name(hook_path, hook_name) is_background = is_background_hook(hook_name) # Create ArchiveResult for THIS HOOK (not per plugin) @@ -724,16 +751,18 @@ class SnapshotWorker(Worker): pid=self.pid, ) - # Check if we can advance to next step - self._try_advance_step() + # Reap any background hooks that finished while we worked + self._reap_background_hooks() - # All hooks launched (or completed) - seal using state machine + # All hooks launched (or completed) - terminate bg hooks and seal + self._finalize_background_hooks() # This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing self.snapshot.sm.seal() self.snapshot.refresh_from_db() except Exception as e: # Mark snapshot as sealed even on error (still triggers cleanup) + self._finalize_background_hooks() self.snapshot.sm.seal() self.snapshot.refresh_from_db() raise @@ -753,7 +782,6 @@ class SnapshotWorker(Worker): 
script=hook_path, output_dir=output_dir, config=config, - timeout=120, parent=self.db_process, url=str(self.snapshot.url), snapshot_id=str(self.snapshot.id), @@ -773,12 +801,22 @@ class SnapshotWorker(Worker): except TimeoutError: # Hook exceeded timeout - kill it process.kill(signal_num=9) - exit_code = -1 + exit_code = process.exit_code or 137 # Update ArchiveResult from hook output ar.update_from_output() ar.end_ts = timezone.now() + # Apply hook-emitted JSONL records regardless of exit code + from archivebox.hooks import extract_records_from_process, process_hook_records + + records = extract_records_from_process(process) + if records: + process_hook_records( + records, + overrides={'snapshot': self.snapshot, 'crawl': self.snapshot.crawl}, + ) + # Determine final status from hook exit code if exit_code == 0: ar.status = ar.StatusChoices.SUCCEEDED @@ -787,34 +825,53 @@ class SnapshotWorker(Worker): ar.save(update_fields=['status', 'end_ts', 'modified_at']) - def _try_advance_step(self) -> None: - """Advance current_step if all foreground hooks in current step are done.""" - from django.db.models import Q + def _finalize_background_hooks(self) -> None: + """Gracefully terminate background hooks and update their ArchiveResults.""" + if getattr(self, '_background_hooks_finalized', False): + return + + self._background_hooks_finalized = True + + # Send SIGTERM and wait up to each hook's remaining timeout + self._terminate_background_hooks( + background_processes=self.background_processes, + worker_type='SnapshotWorker', + indent_level=2, + ) + + # Clear to avoid double-termination during on_shutdown + self.background_processes = {} + + # Update STARTED background results now that hooks are done from archivebox.core.models import ArchiveResult - current_step = self.snapshot.current_step - - # Single query: foreground hooks in current step that aren't finished - # Foreground hooks: hook_name doesn't contain '.bg.' 
- pending_foreground = self.snapshot.archiveresult_set.filter( - Q(hook_name__contains=f'__{current_step}_') & # Current step - ~Q(hook_name__contains='.bg.') & # Not background - ~Q(status__in=ArchiveResult.FINAL_STATES) # Not finished - ).exists() - - if pending_foreground: - return # Still waiting for hooks - - # All foreground hooks done - advance! - self.snapshot.current_step += 1 - self.snapshot.save(update_fields=['current_step', 'modified_at']) - - log_worker_event( - worker_type='SnapshotWorker', - event=f'Advanced to step {self.snapshot.current_step}', - indent_level=2, - pid=self.pid, + started_bg = self.snapshot.archiveresult_set.filter( + status=ArchiveResult.StatusChoices.STARTED, + hook_name__contains='.bg.', ) + for ar in started_bg: + ar.update_from_output() + + def _reap_background_hooks(self) -> None: + """Update ArchiveResults for background hooks that already exited.""" + if getattr(self, '_background_hooks_finalized', False): + return + if not self.background_processes: + return + + from archivebox.core.models import ArchiveResult + + for hook_name, process in list(self.background_processes.items()): + exit_code = process.poll() + if exit_code is None: + continue + + ar = self.snapshot.archiveresult_set.filter(hook_name=hook_name).first() + if ar and ar.status == ArchiveResult.StatusChoices.STARTED: + ar.update_from_output() + + # Remove completed hook from tracking + self.background_processes.pop(hook_name, None) def on_shutdown(self, error: BaseException | None = None) -> None: """ @@ -834,12 +891,15 @@ class SnapshotWorker(Worker): super().on_shutdown(error) @staticmethod - def _extract_plugin_name(hook_name: str) -> str: - """Extract plugin name from hook filename.""" - # on_Snapshot__50_wget.py -> wget - name = hook_name.split('__')[-1] # Get part after last __ + def _extract_plugin_name(hook_path: Path, hook_name: str) -> str: + """Extract plugin name from hook path (fallback to filename).""" + plugin_dir = hook_path.parent.name + if 
plugin_dir not in ('plugins', '.'): + return plugin_dir + # Fallback: on_Snapshot__50_wget.py -> wget + name = hook_name.split('__')[-1] name = name.replace('.py', '').replace('.js', '').replace('.sh', '') - name = name.replace('.bg', '') # Remove .bg suffix + name = name.replace('.bg', '') return name @@ -888,7 +948,7 @@ class BinaryWorker(Worker): machine=machine, status=Binary.StatusChoices.QUEUED, retry_at__lte=timezone.now() - ).order_by('retry_at') + ).order_by('retry_at', 'created_at', 'name') def runloop(self) -> None: """Install binary(ies)."""