remove Seed model in favor of Crawl as template

Nick Sweeting
2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions


@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
meta_commands = {
'help': 'archivebox.cli.archivebox_help.main',
'version': 'archivebox.cli.archivebox_version.main',
'mcp': 'archivebox.cli.archivebox_mcp.main',
}
setup_commands = {
'init': 'archivebox.cli.archivebox_init.main',
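For context, the meta_commands mapping registers subcommands by dotted module path so they can be imported lazily. The sketch below is illustrative only (the real ArchiveBoxGroup resolution logic is not shown in this diff) and assumes each target main is itself a click command:

# Illustrative sketch only: one way a click Group can resolve commands that
# are registered as dotted module paths, importing them on first use.
import importlib
import click

class LazyPathGroup(click.Group):          # hypothetical, not ArchiveBoxGroup itself
    commands_by_path = {
        'help': 'archivebox.cli.archivebox_help.main',
        'version': 'archivebox.cli.archivebox_version.main',
        'mcp': 'archivebox.cli.archivebox_mcp.main',
    }

    def get_command(self, ctx, cmd_name):
        dotted_path = self.commands_by_path.get(cmd_name)
        if not dotted_path:
            return super().get_command(ctx, cmd_name)
        module_path, attr = dotted_path.rsplit('.', 1)
        return getattr(importlib.import_module(module_path), attr)   # assumes `main` is a click command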


@@ -36,15 +36,14 @@ def add(urls: str | list[str],
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive.
The new flow is:
The flow is:
1. Save URLs to sources file
2. Create Seed pointing to the file
3. Create Crawl with max_depth
4. Create root Snapshot pointing to file:// URL (depth=0)
5. Orchestrator runs parser extractors on root snapshot
6. Parser extractors output to urls.jsonl
7. URLs are added to Crawl.urls and child Snapshots are created
8. Repeat until max_depth is reached
2. Create Crawl with URLs and max_depth
3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
4. Orchestrator runs parser extractors on root snapshots
5. Parser extractors output to urls.jsonl
6. URLs are added to Crawl.urls and child Snapshots are created
7. Repeat until max_depth is reached
"""
from rich import print
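Roughly, the new flow described in the docstring reduces to the following sketch. The Crawl fields urls, max_depth, and parser come from this diff; the helper name and sources path are illustrative only:

# Rough sketch of the new add() flow; Crawl fields come from this diff,
# the function name and sources path are illustrative.
from pathlib import Path

def add_sketch(urls: list[str], depth: int = 0, parser: str = 'auto'):
    from django.utils import timezone
    from crawls.models import Crawl

    # 1. Save URLs to a sources file (kept for provenance / file:// snapshots)
    sources_file = Path('sources') / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
    sources_file.parent.mkdir(parents=True, exist_ok=True)
    sources_file.write_text('\n'.join(urls))

    # 2. Create the Crawl directly with inline URLs (no intermediate Seed row)
    crawl = Crawl.objects.create(
        urls=sources_file.read_text(),
        max_depth=depth,
        parser=parser,
    )

    # 3-7. The orchestrator takes over: it creates depth=0 Snapshots from
    # Crawl.urls, runs parser extractors, appends URLs found in urls.jsonl
    # back onto Crawl.urls, and repeats until max_depth is reached.
    return crawl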
@@ -55,7 +54,7 @@ def add(urls: str | list[str],
# import models once django is set up
from core.models import Snapshot
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
@@ -66,19 +65,24 @@ def add(urls: str | list[str],
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. Create a new Seed pointing to the sources file
# 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
# Read URLs directly into crawl
urls_content = sources_file.read_text()
crawl = Crawl.objects.create(
urls=urls_content,
extractor=parser,
max_depth=depth,
tags_str=tag,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
parser=parser,
tag=tag,
created_by=created_by_id,
created_by_id=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
}
)
# 3. Create a new Crawl pointing to the Seed (status=queued)
crawl = Crawl.from_seed(seed, max_depth=depth)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
print(f' [dim]Seed: {seed.uri}[/dim]')
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
# 4. The CrawlMachine will create the root Snapshot when started
# Root snapshot URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover URLs
# 3. The CrawlMachine will create the root Snapshot when started
# If URLs are from a file: first URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover more URLs
# Those URLs become child Snapshots (depth=1)
if index_only:
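The comment above describes how URLs discovered by parser extractors become child Snapshots. A minimal sketch of that depth-limited step, assuming a depth field on Snapshot and a crawl foreign key (both assumptions, not shown in this diff):

# Minimal sketch, assuming Snapshot has `depth` and `crawl` fields (assumption):
def create_child_snapshots_sketch(crawl, parent_snapshot, discovered_urls):
    from core.models import Snapshot

    next_depth = parent_snapshot.depth + 1
    if next_depth > crawl.max_depth:
        return []                      # stop recursing once max_depth is reached

    children = []
    for url in discovered_urls:
        snapshot, _ = Snapshot.objects.get_or_create(
            url=url,
            crawl=crawl,
            defaults={'depth': next_depth},
        )
        children.append(snapshot)
    return children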


@@ -76,7 +76,7 @@ def discover_outlinks(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot, ArchiveResult
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator
@@ -117,12 +117,12 @@ def discover_outlinks(
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
seed = Seed.from_file(
crawl = Crawl.from_file(
sources_file,
max_depth=depth,
label=f'crawl --depth={depth}',
created_by=created_by_id,
)
crawl = Crawl.from_seed(seed, max_depth=depth)
# Create snapshots for new URLs
for record in new_url_records:
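Based on the call sites in this commit, Crawl.from_file() subsumes the old Seed.from_file() + Crawl.from_seed() pair. A hedged approximation of what it might do (the real classmethod may differ):

# Hedged approximation of Crawl.from_file(), inferred from its call sites here:
def crawl_from_file_sketch(sources_file, max_depth=0, label='', created_by=None):
    from crawls.models import Crawl
    return Crawl.objects.create(
        urls=sources_file.read_text(),        # URLs now live inline on the Crawl
        max_depth=max_depth,
        label=label or f'file://{sources_file}',
        created_by_id=created_by,
    )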


@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
setup_django()
from django.utils import timezone
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
# Create a seed and crawl for dependency detection
# Create a crawl for dependency detection
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
crawl, created = Crawl.objects.get_or_create(
urls='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
defaults={
'max_depth': 0,
'status': 'queued',
}
)
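Reassembled for readability, the new install-time call looks roughly like this sketch (field names taken from the hunk above). get_or_create keys on the sentinel archivebox://install URI, so repeated installs reuse the same row instead of piling up duplicate crawls:

# Sketch of the idempotent dependency-detection crawl (field names from this diff):
def get_install_crawl_sketch(created_by_id):
    from crawls.models import Crawl
    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',     # sentinel URI, nothing is actually fetched
        label='Dependency detection',
        created_by_id=created_by_id,
        defaults={
            'max_depth': 0,
            'status': 'queued',
        },
    )
    return crawl, created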


@@ -92,7 +92,7 @@ def create_snapshots(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.config import CONSTANTS
created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -108,17 +108,17 @@ def create_snapshots(
# If depth > 0, we need a Crawl to manage recursive discovery
crawl = None
if depth > 0:
# Create a seed for this batch
# Create a crawl for this batch
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
seed = Seed.from_file(
crawl = Crawl.from_file(
sources_file,
max_depth=depth,
label=f'snapshot --depth={depth}',
created_by=created_by_id,
)
crawl = Crawl.from_seed(seed, max_depth=depth)
# Process each record
created_snapshots = []
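The reasoning comment above ("If depth > 0, we need a Crawl to manage recursive discovery") boils down to a guard like this sketch (helper name illustrative; Crawl.from_file as used in the hunk):

# Sketch of the depth guard in create_snapshots(): a Crawl is only created
# when recursive discovery actually needs to be managed.
def maybe_create_crawl_sketch(records, depth, created_by_id, sources_dir):
    from django.utils import timezone
    from crawls.models import Crawl

    if depth <= 0:
        return None                        # plain one-off snapshots, no crawl needed

    sources_file = sources_dir / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
    sources_file.parent.mkdir(parents=True, exist_ok=True)
    sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))

    return Crawl.from_file(
        sources_file,
        max_depth=depth,
        label=f'snapshot --depth={depth}',
        created_by=created_by_id,
    )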


@@ -111,53 +111,27 @@ def version(quiet: bool=False,
machine = Machine.current()
# Get all *_BINARY config values
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
# Get all installed binaries from the database
all_installed = InstalledBinary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
if not binary_config_keys:
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath
bin_name = Path(bin_value).name
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary where abspath ends with this path
installed = InstalledBinary.objects.filter(
machine=machine,
abspath__endswith=bin_value,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
else:
# It's just a binary name - match against name
bin_name = bin_value
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary by name
installed = InstalledBinary.objects.filter(
machine=machine,
name__iexact=bin_name,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
if installed and installed.is_valid:
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(bin_name)
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
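The net effect of this hunk: instead of resolving every *_BINARY config key against the database, archivebox version now simply walks the InstalledBinary rows recorded for the current machine. A condensed sketch of the new loop (model and field names from this diff; the import path and plain print output are assumptions, the real command uses rich formatting):

# Condensed sketch of the new listing loop (rich formatting omitted);
# the `machine.models` import path is an assumption.
def list_installed_binaries_sketch(machine, only_names=None):
    from machine.models import InstalledBinary

    installed_qs = (
        InstalledBinary.objects
        .filter(machine=machine)
        .exclude(abspath='')
        .exclude(abspath__isnull=True)
        .order_by('name')
    )

    failures = []
    for installed in installed_qs:
        if only_names and installed.name not in only_names:
            continue                      # honor `archivebox version <binary>...`
        if installed.is_valid:
            print('√', installed.name, installed.version or 'unknown',
                  installed.binprovider or 'env', installed.abspath)
        else:
            print('X', installed.name, 'not installed')
            failures.append(installed.name)
    return failures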