Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -18,6 +18,7 @@ from rich import print
from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.crawls.schedule_utils import next_run_for_schedule, validate_schedule
if TYPE_CHECKING:
from archivebox.core.models import Snapshot, ArchiveResult
@@ -51,12 +52,50 @@ class CrawlSchedule(ModelWithUUID, ModelWithNotes):
return reverse_lazy('api-1:get_any', args=[self.id])
def save(self, *args, **kwargs):
    """Normalize and validate the cron schedule, persist, and back-link the template.

    Raises:
        ValueError: (via ``validate_schedule``) when the schedule string is
            neither a known alias nor a valid cron expression.
    """
    self.schedule = (self.schedule or '').strip()
    validate_schedule(self.schedule)
    # Fall back to the template crawl's label (or '') when none was given.
    if not self.label:
        self.label = self.template.label if self.template else ''
    super().save(*args, **kwargs)
    # Keep the template crawl pointing back at this schedule so either side
    # can reach the other.
    if self.template:
        self.template.schedule = self
        self.template.save()
@property
def last_run_at(self):
    """Timestamp of the most recent crawl spawned by this schedule.

    Falls back to the template crawl's creation time, then to this
    schedule's own creation time, when no crawl has run yet.
    """
    newest_crawl = self.crawl_set.order_by('-created_at').first()
    if newest_crawl is not None:
        return newest_crawl.created_at
    fallback = self.template if self.template else self
    return fallback.created_at
@property
def next_run_at(self):
    """Next due datetime, derived from the cron schedule and the last run."""
    reference_time = self.last_run_at
    return next_run_for_schedule(self.schedule, reference_time)
def is_due(self, now=None) -> bool:
    """Return True when this schedule is enabled and its next run time has passed.

    Args:
        now: Comparison datetime; defaults to the current time when omitted.
    """
    reference = now or timezone.now()
    if not self.is_enabled:
        return False
    return self.next_run_at <= reference
def enqueue(self, queued_at=None) -> 'Crawl':
    """Create a new queued Crawl copied from this schedule's template crawl.

    Args:
        queued_at: Timestamp used as the new crawl's ``retry_at``; defaults
            to now.

    Returns:
        The newly created Crawl in QUEUED status, linked back to this schedule.

    Raises:
        ValueError: If this schedule has no template crawl to copy from
            (previously this surfaced as an opaque AttributeError on None).
    """
    template = self.template
    if template is None:
        raise ValueError(f'CrawlSchedule {self.id} cannot enqueue: it has no template crawl to copy from')
    queued_at = queued_at or timezone.now()
    # Prefer the template's label, falling back to the schedule's own label.
    label = template.label or self.label
    return Crawl.objects.create(
        urls=template.urls,
        config=template.config or {},
        max_depth=template.max_depth,
        tags_str=template.tags_str,
        persona_id=template.persona_id,
        label=label,
        notes=template.notes,
        schedule=self,
        status=Crawl.StatusChoices.QUEUED,
        retry_at=queued_at,
        created_by=template.created_by,
    )
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
@@ -204,6 +243,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
def get_system_task(self) -> str | None:
urls = self.get_urls_list()
if len(urls) != 1:
return None
system_url = urls[0].strip().lower()
if system_url.startswith('archivebox://'):
return system_url
return None
def add_url(self, entry: dict) -> bool:
"""
@@ -345,6 +393,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def get_runtime_config():
return get_config(crawl=self)
system_task = self.get_system_task()
if system_task == 'archivebox://update':
from archivebox.cli.archivebox_update import process_all_db_snapshots
process_all_db_snapshots()
return None
machine = Machine.current()
declared_binary_names: set[str] = set()
@@ -446,6 +501,12 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
for record in records[:3]:
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
if system_task:
records = [
record
for record in records
if record.get('type') in ('Binary', 'Machine')
]
overrides = {'crawl': self}
stats = process_hook_records(records, overrides=overrides)
if stats:
@@ -519,6 +580,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
install_declared_binaries(declared_binary_names)
# Create snapshots from all URLs in self.urls
if system_task:
leaked_snapshots = self.snapshot_set.all()
if leaked_snapshots.exists():
leaked_count = leaked_snapshots.count()
leaked_snapshots.delete()
print(f'[yellow]⚠️ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]')
with open(debug_log, 'a') as f:
f.write(f'Skipping snapshot creation for system crawl: {system_task}\n')
f.write(f'=== Crawl.run() complete ===\n\n')
f.flush()
return None
with open(debug_log, 'a') as f:
f.write(f'Creating snapshots from URLs...\n')
f.flush()

View File

@@ -0,0 +1,43 @@
from __future__ import annotations
from datetime import datetime
from croniter import croniter
# Map human-friendly schedule names to their cron equivalents. Each
# singular/adverb pair ("day"/"daily") shares one cron expression.
SCHEDULE_ALIASES: dict[str, str] = {
    alias: cron
    for aliases, cron in (
        (("minute", "minutely"), "* * * * *"),
        (("hour", "hourly"), "0 * * * *"),
        (("day", "daily"), "0 0 * * *"),
        (("week", "weekly"), "0 0 * * 0"),
        (("month", "monthly"), "0 0 1 * *"),
        (("year", "yearly"), "0 0 1 1 *"),
    )
    for alias in aliases
}


def normalize_schedule(schedule: str) -> str:
    """Trim *schedule* and translate known aliases to cron expressions.

    Unknown (non-alias) values are returned as-is after trimming, on the
    assumption that they are raw cron expressions.

    Raises:
        ValueError: If the schedule is empty or whitespace-only.
    """
    cleaned = (schedule or "").strip()
    if not cleaned:
        raise ValueError("Schedule cannot be empty.")
    return SCHEDULE_ALIASES.get(cleaned.lower(), cleaned)
def validate_schedule(schedule: str) -> str:
    """Normalize *schedule* and verify it parses as a cron expression.

    Returns:
        The normalized cron expression (aliases expanded, whitespace trimmed).

    Raises:
        ValueError: If the schedule is empty or not valid cron syntax. The
            message includes the rejected value so the caller can see what
            failed validation (previously the value was omitted entirely).
    """
    normalized = normalize_schedule(schedule)
    if not croniter.is_valid(normalized):
        raise ValueError(
            f"Invalid schedule {normalized!r}. Use an alias like daily/weekly/monthly "
            "or a cron expression such as '0 */6 * * *'."
        )
    return normalized
def next_run_for_schedule(schedule: str, after: datetime) -> datetime:
    """Compute the first scheduled run time strictly after *after*.

    The schedule is validated (and aliases expanded) before being handed to
    croniter, so invalid input raises ValueError rather than a croniter error.
    """
    cron_expr = validate_schedule(schedule)
    iterator = croniter(cron_expr, after)
    return iterator.get_next(datetime)