continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

2026-04-04 14:57:56 +10:00 · 2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions
--- a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
@@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse

 import rich_click as click

-EXTRACTOR_NAME = 'parse_html_urls'
+PLUGIN_NAME = 'parse_html_urls'

 # Check if parse_dom_outlinks extractor already ran
 DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
@@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
        record = {
            'type': 'Snapshot',
            'url': found_url,
-            'via_extractor': EXTRACTOR_NAME,
+            'plugin': PLUGIN_NAME,
            'depth': depth + 1,
        }
        if snapshot_id:
--- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
@@ -233,7 +233,7 @@ class TestParseHtmlUrls:
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry


 if __name__ == '__main__':