continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

This commit is contained in:
Nick Sweeting
2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions

View File

@@ -25,7 +25,7 @@ from urllib.request import urlopen
import rich_click as click
EXTRACTOR_NAME = 'parse_txt_urls'
PLUGIN_NAME = 'parse_txt_urls'
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
@@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None):
f.write(json.dumps({
'type': 'Snapshot',
'url': found_url,
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
}) + '\n')
click.echo(f'Found {len(urls_found)} URLs')

View File

@@ -186,7 +186,7 @@ https://other.com
entry = json.loads(output_file.read_text().strip())
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'via_extractor' in entry
assert 'plugin' in entry
if __name__ == '__main__':