Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2026-01-05 02:16:27 +10:00
remove Seed model in favor of Crawl as template
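
Roughly, the calling pattern changes as follows. This is a sketch pieced together from the hunks below, not an excerpt from the codebase; the variable names (sources_file, user_id) and label values are illustrative:

# Before: a Seed row described where the URLs came from, and a Crawl was derived from it.
seed = Seed.from_file(sources_file, label='...', created_by=user_id)
crawl = Crawl.from_seed(seed, max_depth=1)

# After: the Crawl is created directly, holding its URLs inline.
crawl = Crawl.objects.create(urls=sources_file.read_text(), max_depth=1, created_by_id=user_id)
# ...or, when starting from a saved sources file:
crawl = Crawl.from_file(sources_file, max_depth=1, label='...', created_by=user_id)
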
@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
     meta_commands = {
         'help': 'archivebox.cli.archivebox_help.main',
         'version': 'archivebox.cli.archivebox_version.main',
+        'mcp': 'archivebox.cli.archivebox_mcp.main',
     }
     setup_commands = {
         'init': 'archivebox.cli.archivebox_init.main',

@@ -36,15 +36,14 @@ def add(urls: str | list[str],
         created_by_id: int | None=None) -> QuerySet['Snapshot']:
     """Add a new URL or list of URLs to your archive.

-    The new flow is:
+    The flow is:
     1. Save URLs to sources file
-    2. Create Seed pointing to the file
-    3. Create Crawl with max_depth
-    4. Create root Snapshot pointing to file:// URL (depth=0)
-    5. Orchestrator runs parser extractors on root snapshot
-    6. Parser extractors output to urls.jsonl
-    7. URLs are added to Crawl.urls and child Snapshots are created
-    8. Repeat until max_depth is reached
+    2. Create Crawl with URLs and max_depth
+    3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
+    4. Orchestrator runs parser extractors on root snapshots
+    5. Parser extractors output to urls.jsonl
+    6. URLs are added to Crawl.urls and child Snapshots are created
+    7. Repeat until max_depth is reached
     """

     from rich import print
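
A sketch of the flow in the docstring above, using only the Crawl fields visible in the hunks of this commit (urls, max_depth, extractor); the orchestration steps are paraphrased in comments and are not code from the repo:

# Steps 1-2: collect the URLs and create the Crawl directly with them inline.
crawl = Crawl.objects.create(
    urls='https://example.com\nhttps://example.org',
    max_depth=1,            # how many levels of outlinks to follow
    extractor='auto',
)
# Steps 3-7 happen outside add(): the orchestrator/CrawlMachine creates root
# Snapshots for the Crawl URLs (depth=0), parser extractors emit urls.jsonl,
# discovered URLs are appended to Crawl.urls, and child Snapshots are created
# until max_depth is reached.
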
@@ -55,7 +54,7 @@ def add(urls: str | list[str],

     # import models once django is set up
     from core.models import Snapshot
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator

@@ -66,19 +65,24 @@ def add(urls: str | list[str],
     sources_file.parent.mkdir(parents=True, exist_ok=True)
     sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))

-    # 2. Create a new Seed pointing to the sources file
+    # 2. Create a new Crawl with inline URLs
     cli_args = [*sys.argv]
     if cli_args[0].lower().endswith('archivebox'):
         cli_args[0] = 'archivebox'
     cmd_str = ' '.join(cli_args)

     timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-    seed = Seed.from_file(
-        sources_file,
+
+    # Read URLs directly into crawl
+    urls_content = sources_file.read_text()
+
+    crawl = Crawl.objects.create(
+        urls=urls_content,
+        extractor=parser,
+        max_depth=depth,
+        tags_str=tag,
         label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
-        parser=parser,
-        tag=tag,
-        created_by=created_by_id,
+        created_by_id=created_by_id,
         config={
             'ONLY_NEW': not update,
             'INDEX_ONLY': index_only,
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
         }
     )

-    # 3. Create a new Crawl pointing to the Seed (status=queued)
-    crawl = Crawl.from_seed(seed, max_depth=depth)
-
     print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
-    print(f' [dim]Seed: {seed.uri}[/dim]')
+    first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
+    print(f' [dim]First URL: {first_url}[/dim]')

-    # 4. The CrawlMachine will create the root Snapshot when started
-    # Root snapshot URL = file:///path/to/sources/...txt
-    # Parser extractors will run on it and discover URLs
+    # 3. The CrawlMachine will create the root Snapshot when started
+    # If URLs are from a file: first URL = file:///path/to/sources/...txt
+    # Parser extractors will run on it and discover more URLs
     # Those URLs become child Snapshots (depth=1)

     if index_only:

@@ -76,7 +76,7 @@ def discover_outlinks(
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
     from core.models import Snapshot, ArchiveResult
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator

@@ -117,12 +117,12 @@ def discover_outlinks(
     sources_file.parent.mkdir(parents=True, exist_ok=True)
     sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))

-    seed = Seed.from_file(
+    crawl = Crawl.from_file(
         sources_file,
+        max_depth=depth,
         label=f'crawl --depth={depth}',
         created_by=created_by_id,
     )
-    crawl = Crawl.from_seed(seed, max_depth=depth)

     # Create snapshots for new URLs
     for record in new_url_records:

@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
     setup_django()

     from django.utils import timezone
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk

-    # Create a seed and crawl for dependency detection
+    # Create a crawl for dependency detection
     # Using a minimal crawl that will trigger on_Crawl hooks
     created_by_id = get_or_create_system_user_pk()

-    seed, _created = Seed.objects.get_or_create(
-        uri='archivebox://install',
+    crawl, created = Crawl.objects.get_or_create(
+        urls='archivebox://install',
         label='Dependency detection',
         created_by_id=created_by_id,
         defaults={
             'extractor': 'auto',
-        }
-    )
-
-    crawl, created = Crawl.objects.get_or_create(
-        seed=seed,
-        max_depth=0,
-        created_by_id=created_by_id,
-        defaults={
+            'max_depth': 0,
             'status': 'queued',
         }
     )
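
The get_or_create above keys on urls, label, and created_by_id, so repeated `archivebox install` runs reuse a single dependency-detection Crawl instead of creating a new row each time. A minimal illustration of that idempotency, assuming only the fields visible in this hunk:

# Illustration only, not part of the commit.
lookup = dict(urls='archivebox://install', label='Dependency detection', created_by_id=created_by_id)
defaults = {'extractor': 'auto', 'max_depth': 0, 'status': 'queued'}

first, was_created = Crawl.objects.get_or_create(**lookup, defaults=defaults)
second, created_again = Crawl.objects.get_or_create(**lookup, defaults=defaults)
assert first.pk == second.pk and not created_again
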
@@ -92,7 +92,7 @@ def create_snapshots(
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
     from core.models import Snapshot
-    from crawls.models import Seed, Crawl
+    from crawls.models import Crawl
     from archivebox.config import CONSTANTS

     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -108,17 +108,17 @@ def create_snapshots(
     # If depth > 0, we need a Crawl to manage recursive discovery
     crawl = None
     if depth > 0:
-        # Create a seed for this batch
+        # Create a crawl for this batch
         sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
         sources_file.parent.mkdir(parents=True, exist_ok=True)
         sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))

-        seed = Seed.from_file(
+        crawl = Crawl.from_file(
             sources_file,
+            max_depth=depth,
             label=f'snapshot --depth={depth}',
             created_by=created_by_id,
         )
-        crawl = Crawl.from_seed(seed, max_depth=depth)

         # Process each record
         created_snapshots = []

@@ -111,53 +111,27 @@ def version(quiet: bool=False,

     machine = Machine.current()

-    # Get all *_BINARY config values
-    binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
+    # Get all installed binaries from the database
+    all_installed = InstalledBinary.objects.filter(
+        machine=machine
+    ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')

-    if not binary_config_keys:
-        prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
+    if not all_installed.exists():
+        prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
     else:
-        for key in sorted(set(binary_config_keys)):
-            # Get the actual binary name/path from config value
-            # Prioritize Machine.config overrides over base config
-            bin_value = machine.config.get(key) or config.get(key, '').strip()
-            if not bin_value:
+        for installed in all_installed:
+            # Skip if user specified specific binaries and this isn't one
+            if binaries and installed.name not in binaries:
                 continue

-            # Check if it's a path (has slashes) or just a name
-            is_path = '/' in str(bin_value)
-
-            if is_path:
-                # It's a full path - match against abspath
-                bin_name = Path(bin_value).name
-                # Skip if user specified specific binaries and this isn't one
-                if binaries and bin_name not in binaries:
-                    continue
-                # Find InstalledBinary where abspath ends with this path
-                installed = InstalledBinary.objects.filter(
-                    machine=machine,
-                    abspath__endswith=bin_value,
-                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
-            else:
-                # It's just a binary name - match against name
-                bin_name = bin_value
-                # Skip if user specified specific binaries and this isn't one
-                if binaries and bin_name not in binaries:
-                    continue
-                # Find InstalledBinary by name
-                installed = InstalledBinary.objects.filter(
-                    machine=machine,
-                    name__iexact=bin_name,
-                ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
-
-            if installed and installed.is_valid:
+            if installed.is_valid:
                 display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
                 version_str = (installed.version or 'unknown')[:15]
                 provider = (installed.binprovider or 'env')[:8]
-                prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
+                prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
             else:
-                prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
-                failures.append(bin_name)
+                prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
+                failures.append(installed.name)

     # Show hint if no binaries are installed yet
     has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()