Files
ArchiveBox/archivebox/services/runner.py
2026-03-25 05:36:07 -07:00

773 lines
29 KiB
Python

from __future__ import annotations
import asyncio
import json
import os
import shutil
import subprocess
import sys
import time
from contextlib import nullcontext
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryRequestEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.models import Plugin, discover_plugins, filter_plugins
from abx_dl.orchestrator import (
create_bus,
download,
install_plugins as abx_install_plugins,
setup_services as setup_abx_services,
)
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
from .live_ui import LiveBusUI
def _bus_name(prefix: str, identifier: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in identifier)
return f"{prefix}_{normalized}"
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
    """Launch a detached ``archivebox run --daemon`` process if none is recorded.

    Args:
        allow_under_pytest: When false (the default) the function does nothing
            if the ``PYTEST_CURRENT_TEST`` environment variable is set.

    Returns:
        True if a new daemon process was spawned; False if spawning was
        skipped (running under pytest, or a RUNNING orchestrator ``Process``
        row already exists for the current machine).
    """
    under_pytest = bool(os.environ.get("PYTEST_CURRENT_TEST"))
    if under_pytest and not allow_under_pytest:
        return False

    from archivebox.config import CONSTANTS
    from archivebox.machine.models import Machine, Process

    # Tidy up stale process records before checking for a live orchestrator.
    Process.cleanup_stale_running()
    Process.cleanup_orphaned_workers()

    current_machine = Machine.current()
    running_orchestrators = Process.objects.filter(
        machine=current_machine,
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    )
    if running_orchestrators.exists():
        return False

    errors_log = CONSTANTS.LOGS_DIR / "errors.log"
    errors_log.parent.mkdir(parents=True, exist_ok=True)

    # An inherited DATA_DIR wins; otherwise fall back to the configured one.
    child_env = {"DATA_DIR": str(CONSTANTS.DATA_DIR), **os.environ}
    command = [sys.executable, "-m", "archivebox", "run", "--daemon"]

    # The child keeps its own copy of the log descriptor, so the parent's
    # handle can be closed as soon as the process has been started.
    with errors_log.open("a", encoding="utf-8") as log_file:
        subprocess.Popen(
            command,
            cwd=str(CONSTANTS.DATA_DIR),
            env=child_env,
            stdin=subprocess.DEVNULL,
            stdout=log_file,
            stderr=log_file,
            start_new_session=True,
        )
    return True
class CrawlRunner:
    """Run one crawl's snapshots through a dedicated event bus.

    The constructor creates the bus and attaches the ArchiveBox services to
    it; ``run()`` loads state from the database, executes crawl setup, all
    snapshot downloads (bounded by ``MAX_CONCURRENT_SNAPSHOTS``) and crawl
    cleanup, then stops the bus and writes the final crawl status back.
    """

    # Maximum number of snapshots downloaded concurrently.
    MAX_CONCURRENT_SNAPSHOTS = 8

    def __init__(
        self,
        crawl,
        *,
        snapshot_ids: list[str] | None = None,
        selected_plugins: list[str] | None = None,
        process_discovered_snapshots_inline: bool = True,
    ):
        """Create the bus and register services for *crawl*.

        Args:
            crawl: The crawl model instance to run.
            snapshot_ids: Optional explicit snapshot ids to process instead of
                the crawl's depth-0 snapshots.
            selected_plugins: Optional plugin names; when None the ``PLUGINS``
                config value is used (see ``load_run_state``).
            process_discovered_snapshots_inline: When true, snapshots reported
                by ``SnapshotService`` are scheduled on this runner; when
                false they are ignored by this runner.
        """
        self.crawl = crawl
        self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
        self.plugins = discover_plugins()
        ProcessService(self.bus)
        BinaryService(self.bus)
        TagService(self.bus)
        CrawlService(self.bus, crawl_id=str(crawl.id))
        self.process_discovered_snapshots_inline = process_discovered_snapshots_inline

        async def ignore_snapshot(_snapshot_id: str) -> None:
            return None

        SnapshotService(
            self.bus,
            crawl_id=str(crawl.id),
            schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
        )
        ArchiveResultService(self.bus)
        self.selected_plugins = selected_plugins
        self.initial_snapshot_ids = snapshot_ids
        # snapshot id -> task currently (or most recently) running it.
        self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
        self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
        self.persona = None
        self.base_config: dict[str, Any] = {}
        self.derived_config: dict[str, Any] = {}
        self.primary_url = ""
        # Handle on /dev/tty opened by _create_live_ui(); closed in run().
        self._live_stream = None

    async def run(self) -> None:
        """Execute the crawl end to end and always clean up afterwards.

        Cleanup (stopping the bus, closing the live UI stream, finalizing the
        crawl row) runs even if an earlier cleanup step raises; the exception
        still propagates to the caller.
        """
        try:
            snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
            live_ui = self._create_live_ui()
            with live_ui if live_ui is not None else nullcontext():
                setup_abx_services(
                    self.bus,
                    plugins=self.plugins,
                    config_overrides={
                        **self.base_config,
                        "ABX_RUNTIME": "archivebox",
                    },
                    derived_config_overrides=self.derived_config,
                    persist_derived=False,
                    auto_install=True,
                    emit_jsonl=False,
                )
                if snapshot_ids:
                    # The first snapshot supplies the url/config for the
                    # crawl-level setup and cleanup phases.
                    root_snapshot_id = snapshot_ids[0]
                    await self.run_crawl_setup(root_snapshot_id)
                    for snapshot_id in snapshot_ids:
                        await self.enqueue_snapshot(snapshot_id)
                    await self.wait_for_snapshot_tasks()
                    await self.run_crawl_cleanup(root_snapshot_id)
        finally:
            try:
                await self.bus.stop()
            finally:
                # Nested so a failing bus.stop() cannot skip the stream close
                # or leave the crawl row un-finalized.
                if self._live_stream is not None:
                    try:
                        self._live_stream.close()
                    except Exception:
                        # Best-effort close; must not mask the real error.
                        pass
                    self._live_stream = None
                await sync_to_async(self.finalize_run_state, thread_sensitive=True)()

    async def enqueue_snapshot(self, snapshot_id: str) -> None:
        """Start a task for *snapshot_id* unless one is already in flight."""
        task = self.snapshot_tasks.get(snapshot_id)
        if task is not None and not task.done():
            return
        task = asyncio.create_task(self.run_snapshot(snapshot_id))
        self.snapshot_tasks[snapshot_id] = task

    async def wait_for_snapshot_tasks(self) -> None:
        """Wait until no snapshot task is pending, re-raising task failures.

        The task table is re-read on every pass because new snapshot tasks
        may be enqueued while earlier ones are still running.
        """
        while True:
            pending_tasks: list[asyncio.Task[None]] = []
            for snapshot_id, task in list(self.snapshot_tasks.items()):
                if task.done():
                    if self.snapshot_tasks.get(snapshot_id) is task:
                        self.snapshot_tasks.pop(snapshot_id, None)
                    # Surfaces the task's exception, if it raised.
                    task.result()
                    continue
                pending_tasks.append(task)
            if not pending_tasks:
                return
            done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
            for task in done:
                task.result()

    def load_run_state(self) -> list[str]:
        """Load config, persona and process info; return the snapshot ids to run.

        Returns:
            The explicitly requested snapshot ids when given, otherwise the
            ids of snapshots created from the crawl's urls, falling back to
            the crawl's existing depth-0 snapshots ordered by ``created_at``.
        """
        from archivebox.config.configset import get_config
        from archivebox.machine.models import Machine, NetworkInterface, Process

        # Fetch the url list once instead of once per use.
        urls = self.crawl.get_urls_list()
        self.primary_url = urls[0] if urls else ""
        current_iface = NetworkInterface.current(refresh=True)
        current_process = Process.current()
        if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
            current_process.iface = current_iface
            current_process.machine = current_iface.machine
            current_process.save(update_fields=["iface", "machine", "modified_at"])
        self.persona = self.crawl.resolve_persona()
        self.base_config = get_config(crawl=self.crawl)
        self.derived_config = dict(Machine.current().config)
        self.base_config["ABX_RUNTIME"] = "archivebox"
        if self.selected_plugins is None:
            # PLUGINS is a comma-separated list; an empty value means "all".
            raw_plugins = self.base_config["PLUGINS"].strip()
            self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
        if self.persona:
            self.base_config.update(
                self.persona.prepare_runtime_for_crawl(
                    self.crawl,
                    chrome_binary=self.base_config["CHROME_BINARY"],
                ),
            )
        if self.initial_snapshot_ids:
            return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
        created = self.crawl.create_snapshots_from_urls()
        snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
        return [str(snapshot.id) for snapshot in snapshots]

    def finalize_run_state(self) -> None:
        """Release persona runtime state and persist the crawl's final status.

        A finished crawl is sealed. An unfinished crawl that was SEALED is
        moved back to QUEUED, any other non-STARTED status becomes STARTED,
        and ``retry_at`` is set to now if it was empty.
        """
        from archivebox.crawls.models import Crawl

        if self.persona:
            self.persona.cleanup_runtime_for_crawl(self.crawl)
        # Re-read the row rather than trusting the instance held since __init__.
        crawl = Crawl.objects.get(id=self.crawl.id)
        if crawl.is_finished():
            if crawl.status != Crawl.StatusChoices.SEALED:
                crawl.status = Crawl.StatusChoices.SEALED
                crawl.retry_at = None
                crawl.save(update_fields=["status", "retry_at", "modified_at"])
            return
        if crawl.status == Crawl.StatusChoices.SEALED:
            crawl.status = Crawl.StatusChoices.QUEUED
        elif crawl.status != Crawl.StatusChoices.STARTED:
            crawl.status = Crawl.StatusChoices.STARTED
        crawl.retry_at = crawl.retry_at or timezone.now()
        crawl.save(update_fields=["status", "retry_at", "modified_at"])

    def _create_live_ui(self) -> LiveBusUI | None:
        """Build the live terminal UI, or return None when no TTY is attached.

        Prefers writing to ``/dev/tty`` when it can be opened; the opened
        handle is kept on ``self._live_stream`` so ``run()`` can close it.
        """
        stdout_is_tty = sys.stdout.isatty()
        stderr_is_tty = sys.stderr.isatty()
        interactive_tty = stdout_is_tty or stderr_is_tty
        if not interactive_tty:
            return None
        stream = sys.stderr if stderr_is_tty else sys.stdout
        if os.path.exists("/dev/tty"):
            try:
                self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
                stream = self._live_stream
            except OSError:
                self._live_stream = None
        try:
            terminal_size = os.get_terminal_size(stream.fileno())
            terminal_width = terminal_size.columns
            terminal_height = terminal_size.lines
        except (AttributeError, OSError, ValueError):
            # Stream has no usable file descriptor; fall back to shutil's lookup.
            terminal_size = shutil.get_terminal_size(fallback=(160, 40))
            terminal_width = terminal_size.columns
            terminal_height = terminal_size.lines
        ui_console = Console(
            file=stream,
            force_terminal=True,
            width=terminal_width,
            height=terminal_height,
            _environ={
                "COLUMNS": str(terminal_width),
                "LINES": str(terminal_height),
            },
        )
        plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
        live_ui = LiveBusUI(
            self.bus,
            total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
            timeout_seconds=self.base_config["TIMEOUT"],
            ui_console=ui_console,
            interactive_tty=True,
        )
        live_ui.print_intro(
            url=self.primary_url or "crawl",
            output_dir=Path(self.crawl.output_dir),
            plugins_label=plugins_label,
        )
        return live_ui

    def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
        """Return a plain-dict description of a snapshot plus its merged config.

        The config is the snapshot-level config overlaid with this runner's
        ``base_config``, with ``CRAWL_DIR``, ``SNAP_DIR`` and an
        ``EXTRA_CONTEXT`` JSON string (carrying ``snapshot_id`` and
        ``snapshot_depth``) filled in.

        Raises:
            TypeError: If an existing ``EXTRA_CONTEXT`` value does not decode
                to a JSON object.
        """
        from archivebox.core.models import Snapshot
        from archivebox.config.configset import get_config

        snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
        config = get_config(crawl=self.crawl, snapshot=snapshot)
        config.update(self.base_config)
        config["CRAWL_DIR"] = str(self.crawl.output_dir)
        config["SNAP_DIR"] = str(snapshot.output_dir)
        extra_context: dict[str, Any] = {}
        if config.get("EXTRA_CONTEXT"):
            parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
            if not isinstance(parsed_extra_context, dict):
                raise TypeError("EXTRA_CONTEXT must decode to an object")
            extra_context = parsed_extra_context
        extra_context["snapshot_id"] = str(snapshot.id)
        extra_context["snapshot_depth"] = snapshot.depth
        config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
        return {
            "id": str(snapshot.id),
            "url": snapshot.url,
            "title": snapshot.title,
            "timestamp": snapshot.timestamp,
            "bookmarked_at": snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else "",
            "created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
            "tags": snapshot.tags_str(),
            "depth": snapshot.depth,
            "status": snapshot.status,
            "output_dir": str(snapshot.output_dir),
            "config": config,
        }

    async def run_crawl_setup(self, snapshot_id: str) -> None:
        """Call ``download`` with only the install and crawl-setup phases enabled."""
        snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
        await download(
            url=snapshot["url"],
            plugins=self.plugins,
            output_dir=Path(snapshot["output_dir"]),
            selected_plugins=self.selected_plugins,
            config_overrides=snapshot["config"],
            derived_config_overrides=self.derived_config,
            bus=self.bus,
            emit_jsonl=False,
            install_enabled=True,
            crawl_setup_enabled=True,
            crawl_start_enabled=False,
            snapshot_cleanup_enabled=False,
            crawl_cleanup_enabled=False,
            machine_service=None,
            binary_service=None,
            process_service=None,
            archive_result_service=None,
            tag_service=None,
        )

    async def run_crawl_cleanup(self, snapshot_id: str) -> None:
        """Call ``download`` with only the crawl-cleanup phase enabled."""
        snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
        await download(
            bus=self.bus,
            url=snapshot["url"],
            output_dir=Path(snapshot["output_dir"]),
            plugins=self.plugins,
            selected_plugins=self.selected_plugins,
            config_overrides=snapshot["config"],
            derived_config_overrides=self.derived_config,
            emit_jsonl=False,
            install_enabled=False,
            crawl_setup_enabled=False,
            crawl_start_enabled=False,
            snapshot_cleanup_enabled=False,
            crawl_cleanup_enabled=True,
            machine_service=None,
            binary_service=None,
            process_service=None,
            archive_result_service=None,
            tag_service=None,
        )

    async def run_snapshot(self, snapshot_id: str) -> None:
        """Download one snapshot while holding a slot of the concurrency semaphore.

        Sealed snapshots are skipped. A snapshot with depth > 0 is sealed
        without downloading when the crawl limit state reports ``max_size``.
        """
        async with self.snapshot_semaphore:
            snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
            if snapshot["status"] == "sealed":
                return
            if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
                await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
                return
            try:
                await download(
                    url=snapshot["url"],
                    plugins=self.plugins,
                    output_dir=Path(snapshot["output_dir"]),
                    selected_plugins=self.selected_plugins,
                    config_overrides=snapshot["config"],
                    derived_config_overrides=self.derived_config,
                    bus=self.bus,
                    emit_jsonl=False,
                    install_enabled=False,
                    crawl_setup_enabled=False,
                    crawl_start_enabled=True,
                    snapshot_cleanup_enabled=True,
                    crawl_cleanup_enabled=False,
                    machine_service=None,
                    binary_service=None,
                    process_service=None,
                    archive_result_service=None,
                    tag_service=None,
                )
            finally:
                # Drop our own entry only; a newer task for the same snapshot
                # id must not be removed.
                current_task = asyncio.current_task()
                if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
                    self.snapshot_tasks.pop(snapshot_id, None)

    def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
        """Mark the snapshot SEALED with no retry, if it exists and is not already sealed."""
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).first()
        if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED:
            return
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
def run_crawl(
    crawl_id: str,
    *,
    snapshot_ids: list[str] | None = None,
    selected_plugins: list[str] | None = None,
    process_discovered_snapshots_inline: bool = True,
) -> None:
    """Load the crawl with id *crawl_id* and run it to completion synchronously.

    The keyword arguments are forwarded unchanged to ``CrawlRunner``; the
    runner's ``run()`` coroutine is executed with ``asyncio.run``.
    """
    from archivebox.crawls.models import Crawl

    runner = CrawlRunner(
        Crawl.objects.get(id=crawl_id),
        snapshot_ids=snapshot_ids,
        selected_plugins=selected_plugins,
        process_discovered_snapshots_inline=process_discovered_snapshots_inline,
    )
    asyncio.run(runner.run())
async def _run_binary(binary_id: str) -> None:
    """Emit a ``BinaryRequestEvent`` for one ``Binary`` row on a dedicated bus.

    Args:
        binary_id: Primary key of the ``Binary`` to process.

    The bus is stopped in all cases, including when registering services or
    ``setup_abx_services`` fails before the event is emitted.
    """
    from archivebox.config.configset import get_config
    from archivebox.machine.models import Binary, Machine

    binary = await Binary.objects.aget(id=binary_id)
    plugins = discover_plugins()
    config = get_config()
    machine = await sync_to_async(Machine.current, thread_sensitive=True)()
    derived_config = dict(machine.config)
    config["ABX_RUNTIME"] = "archivebox"
    bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
    try:
        # Service wiring lives inside the try so a failure here still
        # reaches bus.stop() below.
        ProcessService(bus)
        BinaryService(bus)
        TagService(bus)
        ArchiveResultService(bus)
        setup_abx_services(
            bus,
            plugins=plugins,
            config_overrides=config,
            derived_config_overrides=derived_config,
            persist_derived=False,
            auto_install=True,
            emit_jsonl=False,
        )
        await bus.emit(
            BinaryRequestEvent(
                name=binary.name,
                plugin_name="archivebox",
                hook_name="on_BinaryRequest__archivebox_run",
                output_dir=str(binary.output_dir),
                binary_id=str(binary.id),
                machine_id=str(binary.machine_id),
                binproviders=binary.binproviders,
                overrides=binary.overrides or None,
            ),
        )
    finally:
        await bus.stop()
def run_binary(binary_id: str) -> None:
    """Synchronous entry point: run ``_run_binary`` for *binary_id* to completion."""
    binary_job = _run_binary(binary_id)
    asyncio.run(binary_job)
async def _run_install(plugin_names: list[str] | None = None) -> None:
    """Install plugin dependencies on a dedicated bus, with a live UI on a TTY.

    Args:
        plugin_names: Plugins to install; None or empty installs for every
            discovered plugin.

    Installation runs against a temporary output directory. The bus is always
    stopped and any ``/dev/tty`` handle opened for the UI is always closed,
    even if setup fails or stopping the bus raises.
    """
    from archivebox.config.configset import get_config
    from archivebox.machine.models import Machine

    plugins = discover_plugins()
    config = get_config()
    machine = await sync_to_async(Machine.current, thread_sensitive=True)()
    derived_config = dict(machine.config)
    config["ABX_RUNTIME"] = "archivebox"
    bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
    live_stream = None
    try:
        # Service wiring lives inside the try so a failure here still
        # reaches bus.stop() below.
        ProcessService(bus)
        BinaryService(bus)
        TagService(bus)
        ArchiveResultService(bus)
        setup_abx_services(
            bus,
            plugins=plugins,
            config_overrides=config,
            derived_config_overrides=derived_config,
            persist_derived=False,
            auto_install=True,
            emit_jsonl=False,
        )
        selected_plugins = filter_plugins(plugins, list(plugin_names), include_providers=True) if plugin_names else plugins
        if not selected_plugins:
            return
        plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
        timeout_seconds = config["TIMEOUT"]
        stdout_is_tty = sys.stdout.isatty()
        stderr_is_tty = sys.stderr.isatty()
        interactive_tty = stdout_is_tty or stderr_is_tty
        ui_console = None
        live_ui = None
        if interactive_tty:
            stream = sys.stderr if stderr_is_tty else sys.stdout
            if os.path.exists("/dev/tty"):
                try:
                    live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
                    stream = live_stream
                except OSError:
                    live_stream = None
            try:
                terminal_size = os.get_terminal_size(stream.fileno())
                terminal_width = terminal_size.columns
                terminal_height = terminal_size.lines
            except (AttributeError, OSError, ValueError):
                # Stream has no usable file descriptor; fall back to shutil's lookup.
                terminal_size = shutil.get_terminal_size(fallback=(160, 40))
                terminal_width = terminal_size.columns
                terminal_height = terminal_size.lines
            ui_console = Console(
                file=stream,
                force_terminal=True,
                width=terminal_width,
                height=terminal_height,
                _environ={
                    "COLUMNS": str(terminal_width),
                    "LINES": str(terminal_height),
                },
            )
        with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
            output_dir = Path(temp_dir)
            if ui_console is not None:
                live_ui = LiveBusUI(
                    bus,
                    total_hooks=_count_selected_hooks(selected_plugins, None),
                    timeout_seconds=timeout_seconds,
                    ui_console=ui_console,
                    interactive_tty=interactive_tty,
                )
                live_ui.print_intro(
                    url="install",
                    output_dir=output_dir,
                    plugins_label=plugins_label,
                )
            with live_ui if live_ui is not None else nullcontext():
                results = await abx_install_plugins(
                    plugin_names=plugin_names,
                    plugins=plugins,
                    output_dir=output_dir,
                    config_overrides=config,
                    derived_config_overrides=derived_config,
                    emit_jsonl=False,
                    bus=bus,
                    machine_service=None,
                    binary_service=None,
                    process_service=None,
                )
            if live_ui is not None:
                live_ui.print_summary(results, output_dir=output_dir)
    finally:
        try:
            await bus.stop()
        finally:
            # Nested so a failing bus.stop() cannot leak the tty handle.
            if live_stream is not None:
                try:
                    live_stream.close()
                except Exception:
                    # Best-effort close; must not mask the real error.
                    pass
def run_install(*, plugin_names: list[str] | None = None) -> None:
    """Synchronous entry point: run ``_run_install`` for *plugin_names* to completion."""
    install_job = _run_install(plugin_names=plugin_names)
    asyncio.run(install_job)
def recover_orphaned_crawls() -> int:
    """Seal or reschedule STARTED crawls that have no ``retry_at`` and no live process.

    A crawl counts as still active when a RUNNING worker/hook/binary process
    has its ``pwd`` inside the output directory of one of the crawl's
    snapshots. Every other candidate is sealed (when it has no snapshots or
    all of them are sealed) or given ``retry_at = now``.

    Returns:
        The number of crawls that were sealed or rescheduled.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Process

    def _is_inside(path: Path, directory) -> bool:
        # relative_to() raises ValueError when *path* is not under *directory*.
        try:
            path.relative_to(directory)
        except ValueError:
            return False
        return True

    orphaned_crawls = list(
        Crawl.objects.filter(
            status=Crawl.StatusChoices.STARTED,
            retry_at__isnull=True,
        ).prefetch_related("snapshot_set"),
    )
    tracked_types = [
        Process.TypeChoices.WORKER,
        Process.TypeChoices.HOOK,
        Process.TypeChoices.BINARY,
    ]
    running_processes = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=tracked_types,
    ).only("pwd")

    active_crawl_ids: set[str] = set()
    for proc in running_processes:
        if not proc.pwd:
            continue
        proc_pwd = Path(proc.pwd)
        for crawl in orphaned_crawls:
            # Attribute the process to the first crawl owning a matching snapshot.
            if any(_is_inside(proc_pwd, snapshot.output_dir) for snapshot in crawl.snapshot_set.all()):
                active_crawl_ids.add(str(crawl.id))
                break

    now = timezone.now()
    recovered = 0
    for crawl in orphaned_crawls:
        if str(crawl.id) in active_crawl_ids:
            continue
        snapshots = list(crawl.snapshot_set.all())
        nothing_left = not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots)
        if nothing_left:
            crawl.status = Crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=["status", "retry_at", "modified_at"])
        else:
            crawl.retry_at = now
            crawl.save(update_fields=["retry_at", "modified_at"])
        recovered += 1
    return recovered
def recover_orphaned_snapshots() -> int:
    """Seal or requeue STARTED snapshots that have no ``retry_at`` and no live process.

    A snapshot counts as still active when a RUNNING worker/hook/binary
    process has its ``pwd`` inside the snapshot's output directory. Every
    other candidate is sealed when it has results and all are in a final
    state (sealing its crawl too if that crawl is finished); otherwise the
    snapshot and its crawl are set back to QUEUED with ``retry_at = now``.

    Returns:
        The number of snapshots that were sealed or requeued.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import ArchiveResult, Snapshot
    from archivebox.machine.models import Process

    def _is_inside(path: Path, directory) -> bool:
        # relative_to() raises ValueError when *path* is not under *directory*.
        try:
            path.relative_to(directory)
        except ValueError:
            return False
        return True

    orphaned_snapshots = list(
        Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
        .select_related("crawl")
        .prefetch_related("archiveresult_set"),
    )
    tracked_types = [
        Process.TypeChoices.WORKER,
        Process.TypeChoices.HOOK,
        Process.TypeChoices.BINARY,
    ]
    running_processes = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=tracked_types,
    ).only("pwd")

    active_snapshot_ids: set[str] = set()
    for proc in running_processes:
        if not proc.pwd:
            continue
        proc_pwd = Path(proc.pwd)
        for snapshot in orphaned_snapshots:
            # Attribute the process to the first snapshot whose directory contains it.
            if _is_inside(proc_pwd, snapshot.output_dir):
                active_snapshot_ids.add(str(snapshot.id))
                break

    now = timezone.now()
    recovered = 0
    for snapshot in orphaned_snapshots:
        if str(snapshot.id) in active_snapshot_ids:
            continue
        results = list(snapshot.archiveresult_set.all())
        all_final = bool(results) and all(result.status in ArchiveResult.FINAL_STATES for result in results)
        if all_final:
            snapshot.status = Snapshot.StatusChoices.SEALED
            snapshot.retry_at = None
            snapshot.downloaded_at = snapshot.downloaded_at or now
            snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
            crawl = snapshot.crawl
            if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
                crawl.status = Crawl.StatusChoices.SEALED
                crawl.retry_at = None
                crawl.save(update_fields=["status", "retry_at", "modified_at"])
        else:
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = now
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            crawl = snapshot.crawl
            crawl.status = Crawl.StatusChoices.QUEUED
            crawl.retry_at = now
            crawl.save(update_fields=["status", "retry_at", "modified_at"])
        recovered += 1
    return recovered
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
    """Process due work one item at a time until nothing is pending.

    Each pass of the loop handles at most one item and then restarts from the
    top, so the checks below act as a priority order: due binaries, QUEUED
    crawls, unsealed snapshots, then STARTED crawls.

    Args:
        daemon: When true, due ``CrawlSchedule`` entries are enqueued on each
            pass (only when ``crawl_id`` is None), and an idle pass sleeps for
            2 seconds and retries instead of returning.
        crawl_id: When set, only that crawl is considered and the binary and
            snapshot checks are skipped.

    Returns:
        ``0`` once an idle pass is reached and ``daemon`` is false.
    """
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Binary

    while True:
        # 0. Enqueue any schedules that are due (daemon mode, all crawls).
        if daemon and crawl_id is None:
            now = timezone.now()
            for schedule in CrawlSchedule.objects.filter(is_enabled=True).select_related("template", "template__created_by"):
                if schedule.is_due(now):
                    schedule.enqueue(queued_at=now)
        # 1. Oldest due binary that is not yet INSTALLED.
        if crawl_id is None:
            binary = (
                Binary.objects.filter(retry_at__lte=timezone.now())
                .exclude(status=Binary.StatusChoices.INSTALLED)
                .order_by("retry_at", "created_at")
                .first()
            )
            if binary is not None:
                # Restart the pass if the processing lock could not be claimed.
                if not binary.claim_processing_lock(lock_seconds=60):
                    continue
                run_binary(str(binary.id))
                continue
        # 2. Oldest due crawl in QUEUED status.
        queued_crawls = Crawl.objects.filter(
            retry_at__lte=timezone.now(),
            status=Crawl.StatusChoices.QUEUED,
        )
        if crawl_id:
            queued_crawls = queued_crawls.filter(id=crawl_id)
        queued_crawls = queued_crawls.order_by("retry_at", "created_at")
        queued_crawl = queued_crawls.first()
        if queued_crawl is not None:
            if not queued_crawl.claim_processing_lock(lock_seconds=60):
                continue
            run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
            continue
        # 3. Oldest due snapshot that is not SEALED, run through its crawl.
        if crawl_id is None:
            snapshot = (
                Snapshot.objects.filter(retry_at__lte=timezone.now())
                .exclude(status=Snapshot.StatusChoices.SEALED)
                .select_related("crawl")
                .order_by("retry_at", "created_at")
                .first()
            )
            if snapshot is not None:
                if not snapshot.claim_processing_lock(lock_seconds=60):
                    continue
                run_crawl(
                    str(snapshot.crawl_id),
                    snapshot_ids=[str(snapshot.id)],
                    process_discovered_snapshots_inline=False,
                )
                continue
        # 4. Oldest due crawl in STARTED status.
        pending = Crawl.objects.filter(
            retry_at__lte=timezone.now(),
            status=Crawl.StatusChoices.STARTED,
        )
        if crawl_id:
            pending = pending.filter(id=crawl_id)
        pending = pending.order_by("retry_at", "created_at")
        crawl = pending.first()
        if crawl is None:
            # Idle pass: poll again in daemon mode, otherwise finish.
            if daemon:
                time.sleep(2.0)
                continue
            return 0
        if not crawl.claim_processing_lock(lock_seconds=60):
            continue
        run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)