Improve plugin hooks system (WIP)

This commit is contained in:
Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -186,7 +186,7 @@ def discover_outlinks(
# Collect discovered URLs from urls.jsonl files
# Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
discovered_urls = {}
for snapshot_id in snapshot_ids:
@@ -195,7 +195,7 @@ def discover_outlinks(
snapshot_dir = Path(snapshot.output_dir)
# Dynamically collect urls.jsonl from ANY plugin subdirectory
for entry in collect_urls_from_extractors(snapshot_dir):
for entry in collect_urls_from_plugins(snapshot_dir):
url = entry.get('url')
if url and url not in discovered_urls:
# Add metadata for crawl tracking

View File

@@ -21,7 +21,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
from archivebox.misc.db import apply_migrations
@@ -106,17 +105,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' √ Loaded {all_links.count()} links from existing main index.')
if quick:
print(' > Skipping full snapshot directory check (quick mode)')
print(' > Skipping orphan snapshot import (quick mode)')
else:
try:
# Links in data folders that dont match their timestamp
fixed, cant_fix = fix_invalid_folder_locations(DATA_DIR)
if fixed:
print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
if cant_fix:
print(f' [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')
# Links in JSON index but not in main index
# Import orphaned links from legacy JSON indexes
orphaned_json_links = {
link_dict['url']: link_dict
for link_dict in parse_json_main_index(DATA_DIR)
@@ -126,7 +118,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_json_links)
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link_dict['url']: link_dict
for link_dict in parse_json_links_details(DATA_DIR)
@@ -136,18 +127,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_data_dir_links)
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
# Links in invalid/duplicate data dirs
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links, DATA_DIR).items()
}
if invalid_folders:
print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
print()
print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
print(' archivebox status')
print(' archivebox list --status=invalid')
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
# Hint for orphaned snapshot directories
print()
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
print(' archivebox update')
except (KeyboardInterrupt, SystemExit):
print(file=sys.stderr)
@@ -157,9 +143,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
print(' archivebox init --quick', file=sys.stderr)
raise SystemExit(1)
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
print('\n[green]----------------------------------------------------------------------[/green]')

View File

@@ -22,7 +22,7 @@ def install(dry_run: bool=False) -> None:
from archivebox.cli.archivebox_init import init
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store InstalledBinary entries in
init() # must init full index because we need a db to store Binary entries in
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')

View File

@@ -25,10 +25,7 @@ LINK_FILTERS = {
'timestamp': lambda pattern: {'timestamp': pattern},
}
STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
@@ -59,45 +56,6 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
return result
def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
from archivebox.misc.checks import check_data_folder
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
check_data_folder()
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
"archived": get_archived_folders,
"unarchived": get_unarchived_folders,
"present": get_present_folders,
"valid": get_valid_folders,
"invalid": get_invalid_folders,
"duplicate": get_duplicate_folders,
"orphaned": get_orphaned_folders,
"corrupted": get_corrupted_folders,
"unrecognized": get_unrecognized_folders,
}
try:
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
@@ -110,12 +68,13 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
# Query DB directly - no filesystem scanning
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
@@ -123,30 +82,27 @@ def search(filter_patterns: list[str] | None=None,
after=after,
)
# Apply status filter
if status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)
if sort:
snapshots = snapshots.order_by(sort)
folders = list_folders(
snapshots=snapshots,
status=status,
out_dir=DATA_DIR,
)
# Export to requested format
if json:
from core.models import Snapshot
# Filter for non-None snapshots
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
output = snapshots.to_json(with_headers=with_headers)
elif html:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
output = snapshots.to_html(with_headers=with_headers)
elif csv:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders = {s.output_dir: s for s in snapshots}
output = printable_folders(folders, with_headers)
print(output)

View File

@@ -2,223 +2,284 @@
__package__ = 'archivebox.cli'
import os
import time
import rich_click as click
from typing import Iterable
from pathlib import Path
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
# Filter types for URL matching: each entry maps a filter name to a function
# that builds the Django ORM queryset kwargs for a given pattern.
LINK_FILTERS = {
    'exact': lambda pattern: {'url': pattern},                            # exact URL equality
    'substring': lambda pattern: {'url__icontains': pattern},             # case-insensitive substring
    'regex': lambda pattern: {'url__iregex': pattern},                    # case-insensitive regex
    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},  # NOTE(review): only matches http:// URLs, never https:// — confirm intended
    'tag': lambda pattern: {'tags__name': pattern},                       # by tag name
    'timestamp': lambda pattern: {'timestamp': pattern},                  # by snapshot timestamp
}
@enforce_types
def update(filter_patterns: Iterable[str] = (),
           filter_type: str = 'exact',
           before: float | None = None,
           after: float | None = None,
           resume: str | None = None,
           batch_size: int = 100,
           continuous: bool = False) -> None:
    """
    Update snapshots: import orphans, reconcile, and re-run failed extractors.

    Three-phase operation:
      - Phase 1: Scan archive/ for orphaned snapshots (skip symlinks)
      - Phase 2: Process all DB snapshots (reconcile + re-queue for archiving)
      - Phase 3: Deduplicate exact duplicates

    With filters: Only phase 2 (DB query), no filesystem scan.
    Without filters: All phases (full update).

    Args:
        filter_patterns: URL patterns to restrict the update to (empty = all).
        filter_type: One of the LINK_FILTERS keys ('exact', 'substring', ...).
        before: Only snapshots bookmarked before this unix timestamp.
        after: Only snapshots bookmarked after this unix timestamp.
        resume: Timestamp-named directory to resume the orphan scan from.
        batch_size: Commit to the DB every N snapshots.
        continuous: If True, loop forever, sleeping 60s between passes.
    """
    from rich import print
    from archivebox.config.django import setup_django
    setup_django()

    from core.models import Snapshot

    while True:
        if filter_patterns or before or after:
            # Filtered mode: query the DB only, no filesystem scan.
            print('[*] Processing filtered snapshots from database...')
            stats = process_filtered_snapshots(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                before=before,
                after=after,
                batch_size=batch_size,
            )
            print_stats(stats)
        else:
            # Full mode: import orphans + process DB + deduplicate.
            stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}

            print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
            stats_combined['phase1'] = import_orphans_from_archive(
                resume_from=resume,
                batch_size=batch_size,
            )

            print('[*] Phase 2: Processing all database snapshots...')
            stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)

            print('[*] Phase 3: Deduplicating...')
            stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()

            print_combined_stats(stats_combined)

        if not continuous:
            break

        print('[yellow]Sleeping 60s before next pass...[/yellow]')
        time.sleep(60)
        resume = None  # only honor --resume on the first pass
def import_orphans_from_archive(resume_from: str | None = None, batch_size: int = 100) -> dict:
    """
    Scan archive/ for orphaned snapshot directories and import them into the DB.

    Symlinked entries are skipped (already migrated). For each real directory
    not yet in the DB, a Snapshot record is created; saving it triggers any
    needed filesystem migration. Unrecognizable directories are moved aside.

    Args:
        resume_from: Skip directories whose name sorts before this timestamp.
        batch_size: Commit the transaction every N directories processed.

    Returns:
        dict with 'processed', 'imported', 'migrated', 'invalid' counters.
    """
    from core.models import Snapshot
    from archivebox.config import CONSTANTS
    from django.db import transaction

    stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
    archive_dir = CONSTANTS.ARCHIVE_DIR
    if not archive_dir.exists():
        return stats

    print('[*] Scanning and sorting by modification time...')
    # Scan and sort by mtime (newest first).
    # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries).
    entries = [
        (e.stat().st_mtime, e.path)
        for e in os.scandir(archive_dir)
        if e.is_dir(follow_symlinks=False)  # skip symlinks (already migrated)
    ]
    entries.sort(reverse=True)  # newest first

    print(f'[*] Found {len(entries)} directories to check')

    # transaction.commit() is only legal while autocommit is disabled
    # (Django raises TransactionManagementError otherwise), so turn it off
    # for the duration of the import loop and restore it afterwards.
    transaction.set_autocommit(False)
    try:
        for mtime, entry_path in entries:
            entry_path = Path(entry_path)

            # Resume from a given timestamp-named directory if specified.
            if resume_from and entry_path.name < resume_from:
                continue

            stats['processed'] += 1

            # Already in DB? Nothing to do.
            snapshot = Snapshot.load_from_directory(entry_path)
            if snapshot:
                continue

            # Not in DB - create an orphaned snapshot record from the directory.
            snapshot = Snapshot.create_from_directory(entry_path)
            if not snapshot:
                # Directory doesn't look like a valid snapshot: quarantine it.
                Snapshot.move_directory_to_invalid(entry_path)
                stats['invalid'] += 1
                print(f" [{stats['processed']}] Invalid: {entry_path.name}")
                continue

            needs_migration = snapshot.fs_migration_needed
            snapshot.save()  # creates DB record + triggers migration
            stats['imported'] += 1
            if needs_migration:
                stats['migrated'] += 1
                print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}")
            else:
                print(f" [{stats['processed']}] Imported: {entry_path.name}")

            # Commit periodically so progress survives interruption.
            if stats['processed'] % batch_size == 0:
                transaction.commit()

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def process_all_db_snapshots(batch_size: int = 100) -> dict:
    """
    Reconcile every snapshot in the DB with its index.json and re-queue it.

    Args:
        batch_size: Commit the transaction every N snapshots.

    Returns:
        dict with 'processed', 'reconciled', 'queued' counters.
    """
    from core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
    total = Snapshot.objects.count()
    print(f'[*] Processing {total} snapshots from database...')

    # transaction.commit() requires autocommit to be off (Django raises
    # TransactionManagementError otherwise), so manage it explicitly.
    transaction.set_autocommit(False)
    try:
        for snapshot in Snapshot.objects.iterator():
            # Reconcile index.json with DB
            snapshot.reconcile_with_index_json()

            # Queue for archiving (the state machine will pick it up)
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            stats['reconciled'] += 1
            stats['queued'] += 1
            stats['processed'] += 1

            if stats['processed'] % batch_size == 0:
                transaction.commit()
                print(f" [{stats['processed']}/{total}] Processed...")

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def process_filtered_snapshots(
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    batch_size: int,
) -> dict:
    """
    Reconcile and re-queue only the snapshots matching the given filters.

    Pure DB query - no filesystem scan is performed.

    Args:
        filter_patterns: URL patterns to match (empty = all snapshots).
        filter_type: One of the LINK_FILTERS keys.
        before: Only snapshots bookmarked before this unix timestamp.
        after: Only snapshots bookmarked after this unix timestamp.
        batch_size: Commit the transaction every N snapshots.

    Returns:
        dict with 'processed', 'reconciled', 'queued' counters.
    """
    from datetime import datetime

    from core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    stats = {'processed': 0, 'reconciled': 0, 'queued': 0}

    snapshots = Snapshot.objects.all()
    if filter_patterns:
        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
    if before:
        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
    if after:
        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))

    total = snapshots.count()
    print(f'[*] Found {total} matching snapshots')

    # transaction.commit() requires autocommit to be off (Django raises
    # TransactionManagementError otherwise), so manage it explicitly.
    transaction.set_autocommit(False)
    try:
        for snapshot in snapshots.iterator():
            # Reconcile index.json with DB
            snapshot.reconcile_with_index_json()

            # Queue for archiving
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            stats['reconciled'] += 1
            stats['queued'] += 1
            stats['processed'] += 1

            if stats['processed'] % batch_size == 0:
                transaction.commit()
                print(f" [{stats['processed']}/{total}] Processed...")

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def print_stats(stats: dict) -> None:
    """Print statistics for filtered mode.

    Args:
        stats: Counter dict returned by process_filtered_snapshots(),
               with 'processed', 'reconciled', and 'queued' keys.
    """
    from rich import print
    # NOTE(review): interior whitespace of this message is reproduced as-is;
    # confirm the intended alignment of the counter labels.
    print(f"""
[green]Update Complete[/green]
Processed: {stats['processed']}
Reconciled: {stats['reconciled']}
Queued: {stats['queued']}
""")
def print_combined_stats(stats_combined: dict) -> None:
    """Print statistics for full mode.

    Args:
        stats_combined: dict with 'phase1' (orphan-import counters),
            'phase2' (DB-processing counters), and 'deduplicated' (int).
    """
    from rich import print
    s1 = stats_combined['phase1']
    s2 = stats_combined['phase2']
    # .get() with defaults: a phase may have been skipped, leaving an empty dict.
    print(f"""
[green]Archive Update Complete[/green]
Phase 1 (Import Orphans):
Checked: {s1.get('processed', 0)}
Imported: {s1.get('imported', 0)}
Migrated: {s1.get('migrated', 0)}
Invalid: {s1.get('invalid', 0)}
Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}
Phase 3 (Deduplicate):
Merged: {stats_combined['deduplicated']}
""")
# CLI entry point. Only options accepted by update() are declared here:
# the old --only-new/--index-only/--status/--plugins/--max-workers options
# (and the duplicate --resume/--before/--after/--filter-type declarations)
# were removed because update(**kwargs) no longer accepts them and the
# conflicting duplicates would raise at invocation time.
@click.command()
@click.option('--resume', type=str, help='Resume from timestamp')
@click.option('--before', type=float, help='Only snapshots before timestamp')
@click.option('--after', type=float, help='Only snapshots after timestamp')
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    update(**kwargs)


if __name__ == '__main__':
    main()
# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
# index_only: bool=False,
# overwrite: bool=False,
# filter_patterns_str: Optional[str]=None,
# filter_patterns: Optional[List[str]]=None,
# filter_type: Optional[str]=None,
# status: Optional[str]=None,
# after: Optional[str]=None,
# before: Optional[str]=None,
# extractors: str="",
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Import any new links from subscriptions and retry any previously failed/skipped links"""
# from core.models import ArchiveResult
# from .search import index_links
# # from workers.supervisord_util import start_cli_workers
# check_data_folder()
# # start_cli_workers()
# new_links: List[Link] = [] # TODO: Remove input argument: only_new
# extractors = extractors.split(",") if extractors else []
# # Step 1: Filter for selected_links
# print('[*] Finding matching Snapshots to update...')
# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
# matching_snapshots = list_links(
# filter_patterns=filter_patterns,
# filter_type=filter_type,
# before=before,
# after=after,
# )
# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
# matching_folders = list_folders(
# links=matching_snapshots,
# status=status,
# out_dir=out_dir,
# )
# all_links = (link for link in matching_folders.values() if link)
# print(' - Sorting by most unfinished -> least unfinished + date archived...')
# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
# if index_only:
# for link in all_links:
# write_link_details(link, out_dir=out_dir, skip_sql_index=True)
# index_links(all_links, out_dir=out_dir)
# return all_links
# # Step 2: Run the archive methods for each link
# to_archive = new_links if only_new else all_links
# if resume:
# to_archive = [
# link for link in to_archive
# if link.timestamp >= str(resume)
# ]
# if not to_archive:
# stderr('')
# stderr(f'[√] Nothing found to resume after {resume}', color='green')
# return all_links
# archive_kwargs = {
# "out_dir": out_dir,
# }
# if extractors:
# archive_kwargs["methods"] = extractors
# archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir)
# return all_links

View File

@@ -107,12 +107,12 @@ def version(quiet: bool=False,
from archivebox.config.django import setup_django
setup_django()
from machine.models import Machine, InstalledBinary
from machine.models import Machine, Binary
machine = Machine.current()
# Get all installed binaries from the database
all_installed = InstalledBinary.objects.filter(
# Get all binaries from the database
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
@@ -134,7 +134,7 @@ def version(quiet: bool=False,
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')

View File

@@ -472,25 +472,25 @@ class TestURLCollection(unittest.TestCase):
"""Clean up test directory."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_collect_urls_from_extractors(self):
"""Should collect urls.jsonl from all extractor subdirectories."""
from archivebox.hooks import collect_urls_from_extractors
def test_collect_urls_from_plugins(self):
"""Should collect urls.jsonl from all parser plugin subdirectories."""
from archivebox.hooks import collect_urls_from_plugins
urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)
self.assertEqual(len(urls), 4)
# Check that via_extractor is set
extractors = {u['via_extractor'] for u in urls}
self.assertIn('wget', extractors)
self.assertIn('parse_html_urls', extractors)
self.assertNotIn('screenshot', extractors) # No urls.jsonl
# Check that plugin is set
plugins = {u['plugin'] for u in urls}
self.assertIn('wget', plugins)
self.assertIn('parse_html_urls', plugins)
self.assertNotIn('screenshot', plugins) # No urls.jsonl
def test_collect_urls_preserves_metadata(self):
"""Should preserve metadata from urls.jsonl entries."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)
# Find the entry with title
titled = [u for u in urls if u.get('title') == 'HTML Link 2']
@@ -499,10 +499,10 @@ class TestURLCollection(unittest.TestCase):
def test_collect_urls_empty_dir(self):
"""Should handle empty or non-existent directories."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
empty_dir = self.test_dir / 'nonexistent'
urls = collect_urls_from_extractors(empty_dir)
urls = collect_urls_from_plugins(empty_dir)
self.assertEqual(len(urls), 0)
@@ -612,7 +612,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox crawl URL
Should create snapshot, run plugins, output discovered URLs.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create a mock snapshot directory with urls.jsonl
@@ -627,7 +627,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)
# Collect URLs (as crawl does)
discovered = collect_urls_from_extractors(test_snapshot_dir)
discovered = collect_urls_from_plugins(test_snapshot_dir)
self.assertEqual(len(discovered), 2)
@@ -688,7 +688,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
created_by_id = get_or_create_system_user_pk()
@@ -707,7 +707,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)
# Step 3: Collect discovered URLs (crawl output)
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
crawl_output = []
for entry in discovered:
entry['type'] = TYPE_SNAPSHOT
@@ -835,7 +835,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
"""
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create mock output directory
@@ -847,17 +847,17 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 1)
self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
self.assertEqual(discovered[0]['via_extractor'], 'parse_html_urls')
self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
def test_rss_parser_workflow(self):
"""
Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
@@ -869,16 +869,16 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 2)
self.assertTrue(all(d['via_extractor'] == 'parse_rss_urls' for d in discovered))
self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
def test_multiple_parsers_dedupe(self):
"""
Multiple parsers may discover the same URL - should be deduplicated.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
# Create mock output with duplicate URLs from different parsers
snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
@@ -895,7 +895,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
all_discovered = collect_urls_from_extractors(snapshot_dir)
all_discovered = collect_urls_from_plugins(snapshot_dir)
# Both entries are returned (deduplication happens at the crawl command level)
self.assertEqual(len(all_discovered), 2)