remove Seed model in favor of Crawl as template

Nick Sweeting
2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions


@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
meta_commands = {
'help': 'archivebox.cli.archivebox_help.main',
'version': 'archivebox.cli.archivebox_version.main',
'mcp': 'archivebox.cli.archivebox_mcp.main',
}
setup_commands = {
'init': 'archivebox.cli.archivebox_init.main',
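For context, the meta_commands mapping registers subcommands by dotted module path so they can be imported lazily. The sketch below is illustrative only (the real ArchiveBoxGroup resolution logic is not shown in this diff) and assumes each target main is itself a click command:

# Illustrative sketch only: one way a click Group can resolve commands that
# are registered as dotted module paths, importing them on first use.
import importlib
import click

class LazyPathGroup(click.Group):          # hypothetical, not ArchiveBoxGroup itself
    commands_by_path = {
        'help': 'archivebox.cli.archivebox_help.main',
        'version': 'archivebox.cli.archivebox_version.main',
        'mcp': 'archivebox.cli.archivebox_mcp.main',
    }

    def get_command(self, ctx, cmd_name):
        dotted_path = self.commands_by_path.get(cmd_name)
        if not dotted_path:
            return super().get_command(ctx, cmd_name)
        module_path, attr = dotted_path.rsplit('.', 1)
        return getattr(importlib.import_module(module_path), attr)   # assumes `main` is a click command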


@@ -36,15 +36,14 @@ def add(urls: str | list[str],
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive.
The new flow is:
The flow is:
1. Save URLs to sources file
2. Create Seed pointing to the file
3. Create Crawl with max_depth
4. Create root Snapshot pointing to file:// URL (depth=0)
5. Orchestrator runs parser extractors on root snapshot
6. Parser extractors output to urls.jsonl
7. URLs are added to Crawl.urls and child Snapshots are created
8. Repeat until max_depth is reached
2. Create Crawl with URLs and max_depth
3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
4. Orchestrator runs parser extractors on root snapshots
5. Parser extractors output to urls.jsonl
6. URLs are added to Crawl.urls and child Snapshots are created
7. Repeat until max_depth is reached
"""
from rich import print
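Roughly, the new flow described in the docstring reduces to the following sketch. The Crawl fields urls, max_depth, and parser come from this diff; the helper name and sources path are illustrative only:

# Rough sketch of the new add() flow; Crawl fields come from this diff,
# the function name and sources path are illustrative.
from pathlib import Path

def add_sketch(urls: list[str], depth: int = 0, parser: str = 'auto'):
    from django.utils import timezone
    from crawls.models import Crawl

    # 1. Save URLs to a sources file (kept for provenance / file:// snapshots)
    sources_file = Path('sources') / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
    sources_file.parent.mkdir(parents=True, exist_ok=True)
    sources_file.write_text('\n'.join(urls))

    # 2. Create the Crawl directly with inline URLs (no intermediate Seed row)
    crawl = Crawl.objects.create(
        urls=sources_file.read_text(),
        max_depth=depth,
        parser=parser,
    )

    # 3-7. The orchestrator takes over: it creates depth=0 Snapshots from
    # Crawl.urls, runs parser extractors, appends URLs found in urls.jsonl
    # back onto Crawl.urls, and repeats until max_depth is reached.
    return crawl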
@@ -55,7 +54,7 @@ def add(urls: str | list[str],
# import models once django is set up
from core.models import Snapshot
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
@@ -66,19 +65,24 @@ def add(urls: str | list[str],
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. Create a new Seed pointing to the sources file
# 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
# Read URLs directly into crawl
urls_content = sources_file.read_text()
crawl = Crawl.objects.create(
urls=urls_content,
extractor=parser,
max_depth=depth,
tags_str=tag,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
parser=parser,
tag=tag,
created_by=created_by_id,
created_by_id=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
}
)
# 3. Create a new Crawl pointing to the Seed (status=queued)
crawl = Crawl.from_seed(seed, max_depth=depth)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
print(f' [dim]Seed: {seed.uri}[/dim]')
first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
print(f' [dim]First URL: {first_url}[/dim]')
# 4. The CrawlMachine will create the root Snapshot when started
# Root snapshot URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover URLs
# 3. The CrawlMachine will create the root Snapshot when started
# If URLs are from a file: first URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover more URLs
# Those URLs become child Snapshots (depth=1)
if index_only:
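The comment above describes how URLs discovered by parser extractors become child Snapshots. A minimal sketch of that depth-limited step, assuming a depth field on Snapshot and a crawl foreign key (both assumptions, not shown in this diff):

# Minimal sketch, assuming Snapshot has `depth` and `crawl` fields (assumption):
def create_child_snapshots_sketch(crawl, parent_snapshot, discovered_urls):
    from core.models import Snapshot

    next_depth = parent_snapshot.depth + 1
    if next_depth > crawl.max_depth:
        return []                      # stop recursing once max_depth is reached

    children = []
    for url in discovered_urls:
        snapshot, _ = Snapshot.objects.get_or_create(
            url=url,
            crawl=crawl,
            defaults={'depth': next_depth},
        )
        children.append(snapshot)
    return children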


@@ -76,7 +76,7 @@ def discover_outlinks(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot, ArchiveResult
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator
@@ -117,12 +117,12 @@ def discover_outlinks(
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
seed = Seed.from_file(
crawl = Crawl.from_file(
sources_file,
max_depth=depth,
label=f'crawl --depth={depth}',
created_by=created_by_id,
)
crawl = Crawl.from_seed(seed, max_depth=depth)
# Create snapshots for new URLs
for record in new_url_records:
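Based on the call sites in this commit, Crawl.from_file() subsumes the old Seed.from_file() + Crawl.from_seed() pair. A hedged approximation of what it might do (the real classmethod may differ):

# Hedged approximation of Crawl.from_file(), inferred from its call sites here:
def crawl_from_file_sketch(sources_file, max_depth=0, label='', created_by=None):
    from crawls.models import Crawl
    return Crawl.objects.create(
        urls=sources_file.read_text(),        # URLs now live inline on the Crawl
        max_depth=max_depth,
        label=label or f'file://{sources_file}',
        created_by_id=created_by,
    )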


@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
setup_django()
from django.utils import timezone
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
# Create a seed and crawl for dependency detection
# Create a crawl for dependency detection
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
crawl, created = Crawl.objects.get_or_create(
urls='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
defaults={
'max_depth': 0,
'status': 'queued',
}
)
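Reassembled for readability, the new install-time call looks roughly like this sketch (field names taken from the hunk above). get_or_create keys on the sentinel archivebox://install URI, so repeated installs reuse the same row instead of piling up duplicate crawls:

# Sketch of the idempotent dependency-detection crawl (field names from this diff):
def get_install_crawl_sketch(created_by_id):
    from crawls.models import Crawl
    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',     # sentinel URI, nothing is actually fetched
        label='Dependency detection',
        created_by_id=created_by_id,
        defaults={
            'max_depth': 0,
            'status': 'queued',
        },
    )
    return crawl, created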


@@ -92,7 +92,7 @@ def create_snapshots(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.config import CONSTANTS
created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -108,17 +108,17 @@ def create_snapshots(
# If depth > 0, we need a Crawl to manage recursive discovery
crawl = None
if depth > 0:
# Create a seed for this batch
# Create a crawl for this batch
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
seed = Seed.from_file(
crawl = Crawl.from_file(
sources_file,
max_depth=depth,
label=f'snapshot --depth={depth}',
created_by=created_by_id,
)
crawl = Crawl.from_seed(seed, max_depth=depth)
# Process each record
created_snapshots = []
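The reasoning comment above ("If depth > 0, we need a Crawl to manage recursive discovery") boils down to a guard like this sketch (helper name illustrative; Crawl.from_file as used in the hunk):

# Sketch of the depth guard in create_snapshots(): a Crawl is only created
# when recursive discovery actually needs to be managed.
def maybe_create_crawl_sketch(records, depth, created_by_id, sources_dir):
    from django.utils import timezone
    from crawls.models import Crawl

    if depth <= 0:
        return None                        # plain one-off snapshots, no crawl needed

    sources_file = sources_dir / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
    sources_file.parent.mkdir(parents=True, exist_ok=True)
    sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))

    return Crawl.from_file(
        sources_file,
        max_depth=depth,
        label=f'snapshot --depth={depth}',
        created_by=created_by_id,
    )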


@@ -111,53 +111,27 @@ def version(quiet: bool=False,
machine = Machine.current()
# Get all *_BINARY config values
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
# Get all installed binaries from the database
all_installed = InstalledBinary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
if not binary_config_keys:
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath
bin_name = Path(bin_value).name
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary where abspath ends with this path
installed = InstalledBinary.objects.filter(
machine=machine,
abspath__endswith=bin_value,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
else:
# It's just a binary name - match against name
bin_name = bin_value
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary by name
installed = InstalledBinary.objects.filter(
machine=machine,
name__iexact=bin_name,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
if installed and installed.is_valid:
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(bin_name)
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
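The net effect of this hunk: instead of resolving every *_BINARY config key against the database, archivebox version now simply walks the InstalledBinary rows recorded for the current machine. A condensed sketch of the new loop (model and field names from this diff; the import path and plain print output are assumptions, the real command uses rich formatting):

# Condensed sketch of the new listing loop (rich formatting omitted);
# the `machine.models` import path is an assumption.
def list_installed_binaries_sketch(machine, only_names=None):
    from machine.models import InstalledBinary

    installed_qs = (
        InstalledBinary.objects
        .filter(machine=machine)
        .exclude(abspath='')
        .exclude(abspath__isnull=True)
        .order_by('name')
    )

    failures = []
    for installed in installed_qs:
        if only_names and installed.name not in only_names:
            continue                      # honor `archivebox version <binary>...`
        if installed.is_valid:
            print('√', installed.name, installed.version or 'unknown',
                  installed.binprovider or 'env', installed.abspath)
        else:
            print('X', installed.name, 'not installed')
            failures.append(installed.name)
    return failures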