Improve plugin hooks system (WIP)

This commit is contained in:
Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -186,7 +186,7 @@ def discover_outlinks(
# Collect discovered URLs from urls.jsonl files
# Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
discovered_urls = {}
for snapshot_id in snapshot_ids:
@@ -195,7 +195,7 @@ def discover_outlinks(
snapshot_dir = Path(snapshot.output_dir)
# Dynamically collect urls.jsonl from ANY plugin subdirectory
for entry in collect_urls_from_extractors(snapshot_dir):
for entry in collect_urls_from_plugins(snapshot_dir):
url = entry.get('url')
if url and url not in discovered_urls:
# Add metadata for crawl tracking

View File

@@ -21,7 +21,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
from archivebox.misc.db import apply_migrations
@@ -106,17 +105,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' √ Loaded {all_links.count()} links from existing main index.')
if quick:
print(' > Skipping full snapshot directory check (quick mode)')
print(' > Skipping orphan snapshot import (quick mode)')
else:
try:
# Links in data folders that dont match their timestamp
fixed, cant_fix = fix_invalid_folder_locations(DATA_DIR)
if fixed:
print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
if cant_fix:
print(f' [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')
# Links in JSON index but not in main index
# Import orphaned links from legacy JSON indexes
orphaned_json_links = {
link_dict['url']: link_dict
for link_dict in parse_json_main_index(DATA_DIR)
@@ -126,7 +118,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_json_links)
print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link_dict['url']: link_dict
for link_dict in parse_json_links_details(DATA_DIR)
@@ -136,18 +127,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
pending_links.update(orphaned_data_dir_links)
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
# Links in invalid/duplicate data dirs
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links, DATA_DIR).items()
}
if invalid_folders:
print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
print()
print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
print(' archivebox status')
print(' archivebox list --status=invalid')
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
# Hint for orphaned snapshot directories
print()
print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
print(' archivebox update')
except (KeyboardInterrupt, SystemExit):
print(file=sys.stderr)
@@ -157,9 +143,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
print(' archivebox init --quick', file=sys.stderr)
raise SystemExit(1)
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
print('\n[green]----------------------------------------------------------------------[/green]')

View File

@@ -22,7 +22,7 @@ def install(dry_run: bool=False) -> None:
from archivebox.cli.archivebox_init import init
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store InstalledBinary entries in
init() # must init full index because we need a db to store Binary entries in
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')

View File

@@ -25,10 +25,7 @@ LINK_FILTERS = {
'timestamp': lambda pattern: {'timestamp': pattern},
}
STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
]
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
@@ -59,45 +56,6 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
return result
def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
from archivebox.misc.checks import check_data_folder
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
check_data_folder()
STATUS_FUNCTIONS = {
"indexed": get_indexed_folders,
"archived": get_archived_folders,
"unarchived": get_unarchived_folders,
"present": get_present_folders,
"valid": get_valid_folders,
"invalid": get_invalid_folders,
"duplicate": get_duplicate_folders,
"orphaned": get_orphaned_folders,
"corrupted": get_corrupted_folders,
"unrecognized": get_unrecognized_folders,
}
try:
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@enforce_types
def search(filter_patterns: list[str] | None=None,
filter_type: str='substring',
@@ -110,12 +68,13 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
# Query DB directly - no filesystem scanning
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
@@ -123,30 +82,27 @@ def search(filter_patterns: list[str] | None=None,
after=after,
)
# Apply status filter
if status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
elif status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
# 'indexed' = all snapshots (no filter)
if sort:
snapshots = snapshots.order_by(sort)
folders = list_folders(
snapshots=snapshots,
status=status,
out_dir=DATA_DIR,
)
# Export to requested format
if json:
from core.models import Snapshot
# Filter for non-None snapshots
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
output = snapshots.to_json(with_headers=with_headers)
elif html:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
output = snapshots.to_html(with_headers=with_headers)
elif csv:
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
# Convert to dict for printable_folders
folders = {s.output_dir: s for s in snapshots}
output = printable_folders(folders, with_headers)
print(output)

View File

@@ -2,223 +2,284 @@
__package__ = 'archivebox.cli'
import os
import time
import rich_click as click
from typing import Iterable
from pathlib import Path
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
# Filter types for URL matching: each entry maps a filter name to a function
# that builds the Django ORM queryset kwargs for a given pattern.
LINK_FILTERS = {
    'exact': lambda pattern: {'url': pattern},                            # exact URL equality
    'substring': lambda pattern: {'url__icontains': pattern},             # case-insensitive substring
    'regex': lambda pattern: {'url__iregex': pattern},                    # case-insensitive regex
    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},  # NOTE(review): only matches http:// URLs, never https:// — confirm intended
    'tag': lambda pattern: {'tags__name': pattern},                       # by tag name
    'timestamp': lambda pattern: {'timestamp': pattern},                  # by snapshot timestamp
}
@enforce_types
def update(filter_patterns: Iterable[str] = (),
           filter_type: str = 'exact',
           before: float | None = None,
           after: float | None = None,
           resume: str | None = None,
           batch_size: int = 100,
           continuous: bool = False) -> None:
    """
    Update snapshots: import orphans, reconcile, and re-run failed extractors.

    Three-phase operation:
      - Phase 1: Scan archive/ for orphaned snapshots (skip symlinks)
      - Phase 2: Process all DB snapshots (reconcile + re-queue for archiving)
      - Phase 3: Deduplicate exact duplicates

    With filters: Only phase 2 (DB query), no filesystem scan.
    Without filters: All phases (full update).

    Args:
        filter_patterns: URL patterns to restrict the update to (empty = all).
        filter_type: One of the LINK_FILTERS keys ('exact', 'substring', ...).
        before: Only snapshots bookmarked before this unix timestamp.
        after: Only snapshots bookmarked after this unix timestamp.
        resume: Timestamp-named directory to resume the orphan scan from.
        batch_size: Commit to the DB every N snapshots.
        continuous: If True, loop forever, sleeping 60s between passes.
    """
    from rich import print
    from archivebox.config.django import setup_django
    setup_django()

    from core.models import Snapshot

    while True:
        if filter_patterns or before or after:
            # Filtered mode: query the DB only, no filesystem scan.
            print('[*] Processing filtered snapshots from database...')
            stats = process_filtered_snapshots(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                before=before,
                after=after,
                batch_size=batch_size,
            )
            print_stats(stats)
        else:
            # Full mode: import orphans + process DB + deduplicate.
            stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}

            print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
            stats_combined['phase1'] = import_orphans_from_archive(
                resume_from=resume,
                batch_size=batch_size,
            )

            print('[*] Phase 2: Processing all database snapshots...')
            stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)

            print('[*] Phase 3: Deduplicating...')
            stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()

            print_combined_stats(stats_combined)

        if not continuous:
            break

        print('[yellow]Sleeping 60s before next pass...[/yellow]')
        time.sleep(60)
        resume = None  # only honor --resume on the first pass
def import_orphans_from_archive(resume_from: str | None = None, batch_size: int = 100) -> dict:
    """
    Scan archive/ for orphaned snapshot directories and import them into the DB.

    Symlinked entries are skipped (already migrated). For each real directory
    not yet in the DB, a Snapshot record is created; saving it triggers any
    needed filesystem migration. Unrecognizable directories are moved aside.

    Args:
        resume_from: Skip directories whose name sorts before this timestamp.
        batch_size: Commit the transaction every N directories processed.

    Returns:
        dict with 'processed', 'imported', 'migrated', 'invalid' counters.
    """
    from core.models import Snapshot
    from archivebox.config import CONSTANTS
    from django.db import transaction

    stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
    archive_dir = CONSTANTS.ARCHIVE_DIR
    if not archive_dir.exists():
        return stats

    print('[*] Scanning and sorting by modification time...')
    # Scan and sort by mtime (newest first).
    # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries).
    entries = [
        (e.stat().st_mtime, e.path)
        for e in os.scandir(archive_dir)
        if e.is_dir(follow_symlinks=False)  # skip symlinks (already migrated)
    ]
    entries.sort(reverse=True)  # newest first

    print(f'[*] Found {len(entries)} directories to check')

    # transaction.commit() is only legal while autocommit is disabled
    # (Django raises TransactionManagementError otherwise), so turn it off
    # for the duration of the import loop and restore it afterwards.
    transaction.set_autocommit(False)
    try:
        for mtime, entry_path in entries:
            entry_path = Path(entry_path)

            # Resume from a given timestamp-named directory if specified.
            if resume_from and entry_path.name < resume_from:
                continue

            stats['processed'] += 1

            # Already in DB? Nothing to do.
            snapshot = Snapshot.load_from_directory(entry_path)
            if snapshot:
                continue

            # Not in DB - create an orphaned snapshot record from the directory.
            snapshot = Snapshot.create_from_directory(entry_path)
            if not snapshot:
                # Directory doesn't look like a valid snapshot: quarantine it.
                Snapshot.move_directory_to_invalid(entry_path)
                stats['invalid'] += 1
                print(f" [{stats['processed']}] Invalid: {entry_path.name}")
                continue

            needs_migration = snapshot.fs_migration_needed
            snapshot.save()  # creates DB record + triggers migration
            stats['imported'] += 1
            if needs_migration:
                stats['migrated'] += 1
                print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}")
            else:
                print(f" [{stats['processed']}] Imported: {entry_path.name}")

            # Commit periodically so progress survives interruption.
            if stats['processed'] % batch_size == 0:
                transaction.commit()

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def process_all_db_snapshots(batch_size: int = 100) -> dict:
    """
    Reconcile every snapshot in the DB with its index.json and re-queue it.

    Args:
        batch_size: Commit the transaction every N snapshots.

    Returns:
        dict with 'processed', 'reconciled', 'queued' counters.
    """
    from core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
    total = Snapshot.objects.count()
    print(f'[*] Processing {total} snapshots from database...')

    # transaction.commit() requires autocommit to be off (Django raises
    # TransactionManagementError otherwise), so manage it explicitly.
    transaction.set_autocommit(False)
    try:
        for snapshot in Snapshot.objects.iterator():
            # Reconcile index.json with DB
            snapshot.reconcile_with_index_json()

            # Queue for archiving (the state machine will pick it up)
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            stats['reconciled'] += 1
            stats['queued'] += 1
            stats['processed'] += 1

            if stats['processed'] % batch_size == 0:
                transaction.commit()
                print(f" [{stats['processed']}/{total}] Processed...")

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def process_filtered_snapshots(
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    batch_size: int,
) -> dict:
    """
    Reconcile and re-queue only the snapshots matching the given filters.

    Pure DB query - no filesystem scan is performed.

    Args:
        filter_patterns: URL patterns to match (empty = all snapshots).
        filter_type: One of the LINK_FILTERS keys.
        before: Only snapshots bookmarked before this unix timestamp.
        after: Only snapshots bookmarked after this unix timestamp.
        batch_size: Commit the transaction every N snapshots.

    Returns:
        dict with 'processed', 'reconciled', 'queued' counters.
    """
    from datetime import datetime

    from core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    stats = {'processed': 0, 'reconciled': 0, 'queued': 0}

    snapshots = Snapshot.objects.all()
    if filter_patterns:
        snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
    if before:
        snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
    if after:
        snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))

    total = snapshots.count()
    print(f'[*] Found {total} matching snapshots')

    # transaction.commit() requires autocommit to be off (Django raises
    # TransactionManagementError otherwise), so manage it explicitly.
    transaction.set_autocommit(False)
    try:
        for snapshot in snapshots.iterator():
            # Reconcile index.json with DB
            snapshot.reconcile_with_index_json()

            # Queue for archiving
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            stats['reconciled'] += 1
            stats['queued'] += 1
            stats['processed'] += 1

            if stats['processed'] % batch_size == 0:
                transaction.commit()
                print(f" [{stats['processed']}/{total}] Processed...")

        transaction.commit()
    finally:
        transaction.set_autocommit(True)

    return stats
def print_stats(stats: dict) -> None:
    """Print statistics for filtered mode.

    Args:
        stats: Counter dict returned by process_filtered_snapshots(),
               with 'processed', 'reconciled', and 'queued' keys.
    """
    from rich import print
    # NOTE(review): interior whitespace of this message is reproduced as-is;
    # confirm the intended alignment of the counter labels.
    print(f"""
[green]Update Complete[/green]
Processed: {stats['processed']}
Reconciled: {stats['reconciled']}
Queued: {stats['queued']}
""")
def print_combined_stats(stats_combined: dict) -> None:
    """Print statistics for full mode.

    Args:
        stats_combined: dict with 'phase1' (orphan-import counters),
            'phase2' (DB-processing counters), and 'deduplicated' (int).
    """
    from rich import print
    s1 = stats_combined['phase1']
    s2 = stats_combined['phase2']
    # .get() with defaults: a phase may have been skipped, leaving an empty dict.
    print(f"""
[green]Archive Update Complete[/green]
Phase 1 (Import Orphans):
Checked: {s1.get('processed', 0)}
Imported: {s1.get('imported', 0)}
Migrated: {s1.get('migrated', 0)}
Invalid: {s1.get('invalid', 0)}
Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}
Phase 3 (Deduplicate):
Merged: {stats_combined['deduplicated']}
""")
# CLI entry point. Only options accepted by update() are declared here:
# the old --only-new/--index-only/--status/--plugins/--max-workers options
# (and the duplicate --resume/--before/--after/--filter-type declarations)
# were removed because update(**kwargs) no longer accepts them and the
# conflicting duplicates would raise at invocation time.
@click.command()
@click.option('--resume', type=str, help='Resume from timestamp')
@click.option('--before', type=float, help='Only snapshots before timestamp')
@click.option('--after', type=float, help='Only snapshots after timestamp')
@click.option('--filter-type', '-t', type=click.Choice(['exact', 'substring', 'regex', 'domain', 'tag', 'timestamp']), default='exact')
@click.option('--batch-size', type=int, default=100, help='Commit every N snapshots')
@click.option('--continuous', is_flag=True, help='Run continuously as background worker')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    update(**kwargs)


if __name__ == '__main__':
    main()
# LEGACY VERSION:
# @enforce_types
# def update(resume: Optional[float]=None,
# only_new: bool=ARCHIVING_CONFIG.ONLY_NEW,
# index_only: bool=False,
# overwrite: bool=False,
# filter_patterns_str: Optional[str]=None,
# filter_patterns: Optional[List[str]]=None,
# filter_type: Optional[str]=None,
# status: Optional[str]=None,
# after: Optional[str]=None,
# before: Optional[str]=None,
# extractors: str="",
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Import any new links from subscriptions and retry any previously failed/skipped links"""
# from core.models import ArchiveResult
# from .search import index_links
# # from workers.supervisord_util import start_cli_workers
# check_data_folder()
# # start_cli_workers()
# new_links: List[Link] = [] # TODO: Remove input argument: only_new
# extractors = extractors.split(",") if extractors else []
# # Step 1: Filter for selected_links
# print('[*] Finding matching Snapshots to update...')
# print(f' - Filtering by {" ".join(filter_patterns)} ({filter_type}) {before=} {after=} {status=}...')
# matching_snapshots = list_links(
# filter_patterns=filter_patterns,
# filter_type=filter_type,
# before=before,
# after=after,
# )
# print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
# matching_folders = list_folders(
# links=matching_snapshots,
# status=status,
# out_dir=out_dir,
# )
# all_links = (link for link in matching_folders.values() if link)
# print(' - Sorting by most unfinished -> least unfinished + date archived...')
# all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
# if index_only:
# for link in all_links:
# write_link_details(link, out_dir=out_dir, skip_sql_index=True)
# index_links(all_links, out_dir=out_dir)
# return all_links
# # Step 2: Run the archive methods for each link
# to_archive = new_links if only_new else all_links
# if resume:
# to_archive = [
# link for link in to_archive
# if link.timestamp >= str(resume)
# ]
# if not to_archive:
# stderr('')
# stderr(f'[√] Nothing found to resume after {resume}', color='green')
# return all_links
# archive_kwargs = {
# "out_dir": out_dir,
# }
# if extractors:
# archive_kwargs["methods"] = extractors
# archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir)
# return all_links

View File

@@ -107,12 +107,12 @@ def version(quiet: bool=False,
from archivebox.config.django import setup_django
setup_django()
from machine.models import Machine, InstalledBinary
from machine.models import Machine, Binary
machine = Machine.current()
# Get all installed binaries from the database
all_installed = InstalledBinary.objects.filter(
# Get all binaries from the database
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
@@ -134,7 +134,7 @@ def version(quiet: bool=False,
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')

View File

@@ -472,25 +472,25 @@ class TestURLCollection(unittest.TestCase):
"""Clean up test directory."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_collect_urls_from_extractors(self):
"""Should collect urls.jsonl from all extractor subdirectories."""
from archivebox.hooks import collect_urls_from_extractors
def test_collect_urls_from_plugins(self):
"""Should collect urls.jsonl from all parser plugin subdirectories."""
from archivebox.hooks import collect_urls_from_plugins
urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)
self.assertEqual(len(urls), 4)
# Check that via_extractor is set
extractors = {u['via_extractor'] for u in urls}
self.assertIn('wget', extractors)
self.assertIn('parse_html_urls', extractors)
self.assertNotIn('screenshot', extractors) # No urls.jsonl
# Check that plugin is set
plugins = {u['plugin'] for u in urls}
self.assertIn('wget', plugins)
self.assertIn('parse_html_urls', plugins)
self.assertNotIn('screenshot', plugins) # No urls.jsonl
def test_collect_urls_preserves_metadata(self):
"""Should preserve metadata from urls.jsonl entries."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
urls = collect_urls_from_extractors(self.test_dir)
urls = collect_urls_from_plugins(self.test_dir)
# Find the entry with title
titled = [u for u in urls if u.get('title') == 'HTML Link 2']
@@ -499,10 +499,10 @@ class TestURLCollection(unittest.TestCase):
def test_collect_urls_empty_dir(self):
"""Should handle empty or non-existent directories."""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
empty_dir = self.test_dir / 'nonexistent'
urls = collect_urls_from_extractors(empty_dir)
urls = collect_urls_from_plugins(empty_dir)
self.assertEqual(len(urls), 0)
@@ -612,7 +612,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox crawl URL
Should create snapshot, run plugins, output discovered URLs.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create a mock snapshot directory with urls.jsonl
@@ -627,7 +627,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)
# Collect URLs (as crawl does)
discovered = collect_urls_from_extractors(test_snapshot_dir)
discovered = collect_urls_from_plugins(test_snapshot_dir)
self.assertEqual(len(discovered), 2)
@@ -688,7 +688,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
created_by_id = get_or_create_system_user_pk()
@@ -707,7 +707,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
)
# Step 3: Collect discovered URLs (crawl output)
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
crawl_output = []
for entry in discovered:
entry['type'] = TYPE_SNAPSHOT
@@ -835,7 +835,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
"""
Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
from archivebox.misc.jsonl import TYPE_SNAPSHOT
# Create mock output directory
@@ -847,17 +847,17 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 1)
self.assertEqual(discovered[0]['url'], 'https://html-discovered.com')
self.assertEqual(discovered[0]['via_extractor'], 'parse_html_urls')
self.assertEqual(discovered[0]['plugin'], 'parse_html_urls')
def test_rss_parser_workflow(self):
"""
Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
# Create mock output directory
snapshot_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
@@ -869,16 +869,16 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
discovered = collect_urls_from_extractors(snapshot_dir)
discovered = collect_urls_from_plugins(snapshot_dir)
self.assertEqual(len(discovered), 2)
self.assertTrue(all(d['via_extractor'] == 'parse_rss_urls' for d in discovered))
self.assertTrue(all(d['plugin'] == 'parse_rss_urls' for d in discovered))
def test_multiple_parsers_dedupe(self):
"""
Multiple parsers may discover the same URL - should be deduplicated.
"""
from archivebox.hooks import collect_urls_from_extractors
from archivebox.hooks import collect_urls_from_plugins
# Create mock output with duplicate URLs from different parsers
snapshot_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
@@ -895,7 +895,7 @@ class TestParserPluginWorkflows(unittest.TestCase):
)
# Collect URLs
all_discovered = collect_urls_from_extractors(snapshot_dir)
all_discovered = collect_urls_from_plugins(snapshot_dir)
# Both entries are returned (deduplication happens at the crawl command level)
self.assertEqual(len(all_discovered), 2)