WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import json
import mimetypes
from collections import defaultdict
from pathlib import Path
@@ -7,9 +8,10 @@ from pathlib import Path
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
from .process_service import ProcessService, parse_event_datetime
@@ -48,22 +50,93 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
def _normalize_status(status: str) -> str:
if status == "noresult":
return "skipped"
return "noresults"
return status or "failed"
def _has_content_files(output_files: list[str]) -> bool:
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
def _iter_archiveresult_records(stdout: str) -> list[dict]:
records: list[dict] = []
for raw_line in stdout.splitlines():
line = raw_line.strip()
if not line.startswith("{"):
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
continue
if record.get("type") == "ArchiveResult":
records.append(record)
return records
class ArchiveResultService(BaseService):
LISTENS_TO = [ArchiveResultEvent]
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
EMITS = []
def __init__(self, bus, *, process_service: ProcessService):
self.process_service = process_service
super().__init__(bus)
async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
def _project(self, event: ArchiveResultEvent) -> None:
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
    """Project a completed snapshot-hook process into ArchiveResult rows.

    Only processes tied to a snapshot and spawned by an on_Snapshot* hook
    are handled. If the hook printed ArchiveResult JSONL records on stdout,
    one DB projection is made per record; otherwise a single synthetic
    record is derived from the exit code and output files.
    """
    # Ignore processes that aren't snapshot hooks (crawl/binary hooks etc.).
    if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
        return
    plugin_dir = Path(event.output_dir)
    # Collecting output metadata touches the filesystem, so keep it off the event loop.
    output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
    records = _iter_archiveresult_records(event.stdout)
    if records:
        # One DB projection per explicit ArchiveResult record on stdout.
        for record in records:
            await run_db_op(
                self._project_from_process_completed,
                event,
                record,
                output_files,
                output_size,
                output_mimetypes,
            )
        return
    # No explicit records: synthesize one from the process outcome.
    # Non-zero exit => failed; zero exit with real content files => succeeded;
    # zero exit with only bookkeeping files (.log/.pid/.sh) => skipped.
    synthetic_record = {
        "plugin": event.plugin_name,
        "hook_name": event.hook_name,
        "status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
        "output_str": event.stderr if event.exit_code != 0 else "",
        "error": event.stderr if event.exit_code != 0 else "",
    }
    await run_db_op(
        self._project_from_process_completed,
        event,
        synthetic_record,
        output_files,
        output_size,
        output_mimetypes,
    )
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
    """Return the output_dir of the Snapshot with *snapshot_id*, or None
    when no such snapshot exists."""
    from archivebox.core.models import Snapshot

    match = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
    if match is None:
        return None
    return str(match.output_dir)
def _project(
self,
event: ArchiveResultEvent,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process
@@ -86,8 +159,6 @@ class ArchiveResultService(BaseService):
},
)
plugin_dir = Path(snapshot.output_dir) / event.plugin
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
result.process = process or result.process
result.status = _normalize_status(event.status)
result.output_str = event.output_str
@@ -97,7 +168,28 @@ class ArchiveResultService(BaseService):
result.output_mimetypes = output_mimetypes
result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
result.retry_at = None
if event.error:
result.notes = event.error
result.save()
def _project_from_process_completed(
    self,
    event: ProcessCompletedEvent,
    record: dict,
    output_files: dict[str, dict],
    output_size: int,
    output_mimetypes: str,
) -> None:
    """Adapt a (process event, record) pair into an ArchiveResultEvent and
    project it via self._project.

    Record fields take precedence over event fields where both are present;
    the error falls back to process stderr when the process exited non-zero.
    """
    archive_result_event = ArchiveResultEvent(
        snapshot_id=record.get("snapshot_id") or event.snapshot_id,
        plugin=record.get("plugin") or event.plugin_name,
        hook_name=record.get("hook_name") or event.hook_name,
        status=record.get("status") or "",
        process_id=event.process_id,
        output_str=record.get("output_str") or "",
        # Only trust output_json when it is actually a dict.
        output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
        start_ts=event.start_ts,
        end_ts=event.end_ts,
        error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
    )
    self._project(archive_result_event, output_files, output_size, output_mimetypes)

View File

@@ -1,19 +1,23 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
import asyncio
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class BinaryService(BaseService):
LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
EMITS = []
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
await sync_to_async(self._project_binary, thread_sensitive=True)(event)
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
await run_db_op(self._project_binary, event)
async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)
async def on_BinaryInstalledEvent__Outer(self, event: BinaryInstalledEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
await run_db_op(self._project_installed_binary, event, resolved)
def _project_binary(self, event: BinaryEvent) -> None:
from archivebox.machine.models import Binary, Machine
@@ -44,7 +48,39 @@ class BinaryService(BaseService):
},
)
def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
"version": event.version or "",
"sha256": event.sha256 or "",
"binproviders": event.binproviders or "",
"binprovider": event.binprovider or "",
}
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
return resolved
try:
from abx_dl.dependencies import load_binary
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
spec = {
"name": event.name,
"binproviders": allowed_providers,
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
if provider_name:
resolved["binprovider"] = str(provider_name)
except Exception:
pass
return resolved
def _project_installed_binary(self, event: BinaryInstalledEvent, resolved: dict[str, str]) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
@@ -55,10 +91,14 @@ class BinaryService(BaseService):
"status": Binary.StatusChoices.QUEUED,
},
)
binary.abspath = event.abspath or binary.abspath
binary.version = event.version or binary.version
binary.sha256 = event.sha256 or binary.sha256
binary.binprovider = event.binprovider or binary.binprovider
binary.abspath = resolved["abspath"] or binary.abspath
binary.version = resolved["version"] or binary.version
binary.sha256 = resolved["sha256"] or binary.sha256
if resolved["binproviders"]:
binary.binproviders = resolved["binproviders"]
binary.binprovider = resolved["binprovider"] or binary.binprovider
if event.overrides and binary.overrides != event.overrides:
binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])

View File

@@ -1,11 +1,10 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class CrawlService(BaseService):
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
@@ -15,17 +14,17 @@ class CrawlService(BaseService):
self.crawl_id = crawl_id
super().__init__(bus)
async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
await sync_to_async(self._mark_started, thread_sensitive=True)()
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
await sync_to_async(self._mark_completed, thread_sensitive=True)()
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
await run_db_op(self._mark_completed)
def _mark_started(self) -> None:
from archivebox.crawls.models import Crawl

16
archivebox/services/db.py Normal file
View File

@@ -0,0 +1,16 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.db import close_old_connections
def _run_db_op(func, *args, **kwargs):
    """Invoke *func* with stale DB connections cleared both before and after.

    close_old_connections() is called on entry and (via finally) on exit,
    whether *func* returns normally or raises.
    """
    close_old_connections()
    try:
        result = func(*args, **kwargs)
    finally:
        close_old_connections()
    return result
async def run_db_op(func, *args, **kwargs):
    """Await *func* on a thread-sensitive worker via sync_to_async, wrapped
    in _run_db_op so DB connection state is cleaned up around the call."""
    runner = sync_to_async(_run_db_op, thread_sensitive=True)
    return await runner(func, *args, **kwargs)

View File

@@ -0,0 +1 @@
from abx_dl.cli import LiveBusUI

View File

@@ -1,16 +1,17 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import MachineEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class MachineService(BaseService):
LISTENS_TO = [MachineEvent]
EMITS = []
async def on_MachineEvent(self, event: MachineEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
await run_db_op(self._project, event)
def _project(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine

View File

@@ -3,12 +3,13 @@ from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
if TYPE_CHECKING:
from archivebox.machine.models import Process
@@ -33,27 +34,33 @@ class ProcessService(BaseService):
self.process_ids: dict[str, str] = {}
super().__init__(bus)
async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
await sync_to_async(self._project_started, thread_sensitive=True)(event)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
await run_db_op(self._project_started, event)
async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
await sync_to_async(self._project_completed, thread_sensitive=True)(event)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
await run_db_op(self._project_completed, event)
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
from archivebox.machine.models import Machine, Process
from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
iface = NetworkInterface.current(refresh=True)
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
process = Process.objects.create(
machine=Machine.current(),
machine=iface.machine,
iface=iface,
process_type=process_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
@@ -77,12 +84,14 @@ class ProcessService(BaseService):
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.status = process.StatusChoices.RUNNING
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
def _project_completed(self, event: ProcessCompletedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir
process.cmd = [event.hook_path, *event.hook_args]
if not process.cmd:
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
@@ -92,4 +101,5 @@ class ProcessService(BaseService):
process.exit_code = event.exit_code
process.status = process.StatusChoices.EXITED
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()

View File

@@ -3,16 +3,21 @@ from __future__ import annotations
import asyncio
import json
import os
import shutil
import subprocess
import sys
import time
from contextlib import nullcontext
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryEvent
from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
@@ -21,6 +26,7 @@ from .machine_service import MachineService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
from .live_ui import LiveBusUI
def _bus_name(prefix: str, identifier: str) -> str:
@@ -35,6 +41,19 @@ def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
return [name.strip() for name in raw.split(",") if name.strip()]
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
total = 0
for plugin in selected.values():
total += len(list(plugin.get_crawl_hooks()))
total += len(list(plugin.get_snapshot_hooks()))
return total
def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
@@ -78,10 +97,51 @@ async def _stop_bus_trace(bus) -> None:
bus._archivebox_trace_task = None
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
    """Spawn a detached `archivebox run --daemon` orchestrator if none is running.

    Returns True if a new daemon process was launched, False when an
    orchestrator is already RUNNING on this machine or when running under
    pytest without *allow_under_pytest*.
    """
    # Avoid forking daemons out of test runs unless explicitly requested.
    if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
        return False
    from archivebox.config import CONSTANTS
    from archivebox.machine.models import Machine, Process
    # NOTE(review): presumably clears stale RUNNING rows so a dead
    # orchestrator doesn't block a fresh launch — confirm in Process model.
    Process.cleanup_stale_running()
    machine = Machine.current()
    if Process.objects.filter(
        machine=machine,
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    ).exists():
        return False
    log_path = CONSTANTS.LOGS_DIR / "errors.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))
    # start_new_session detaches the daemon from our process group so it
    # survives this process exiting; its output goes to the shared error log.
    with log_path.open("a", encoding="utf-8") as log_handle:
        subprocess.Popen(
            [sys.executable, "-m", "archivebox", "run", "--daemon"],
            cwd=str(CONSTANTS.DATA_DIR),
            env=env,
            stdin=subprocess.DEVNULL,
            stdout=log_handle,
            stderr=log_handle,
            start_new_session=True,
        )
    return True
class CrawlRunner:
MAX_CONCURRENT_SNAPSHOTS = 8
def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
def __init__(
self,
crawl,
*,
snapshot_ids: list[str] | None = None,
selected_plugins: list[str] | None = None,
process_discovered_snapshots_inline: bool = True,
):
self.crawl = crawl
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
@@ -90,7 +150,12 @@ class CrawlRunner:
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
self.bus,
crawl_id=str(crawl.id),
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
)
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
self.selected_plugins = selected_plugins
self.initial_snapshot_ids = snapshot_ids
@@ -100,6 +165,29 @@ class CrawlRunner:
self.persona = None
self.base_config: dict[str, Any] = {}
self.primary_url = ""
self._live_stream = None
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
    """Build a dedicated event bus plus DB-projector services for one snapshot.

    Mirrors the service wiring of the main crawl bus so events emitted on
    this bus are projected into the same Django models. Returns the
    (bus, abx_services) pair.
    """
    bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
    process_service = ProcessService(bus)
    # The services below are constructed only for their side effect of
    # attaching themselves to the bus; the instances are not kept.
    MachineService(bus)
    BinaryService(bus)
    TagService(bus)
    CrawlService(bus, crawl_id=str(self.crawl.id))
    SnapshotService(
        bus,
        crawl_id=str(self.crawl.id),
        # Same inline-vs-queued scheduling policy as the main bus.
        schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
    )
    ArchiveResultService(bus, process_service=process_service)
    abx_services = setup_abx_services(
        bus,
        plugins=self.plugins,
        config_overrides=config_overrides,
        auto_install=True,
        emit_jsonl=False,
    )
    return bus, abx_services
async def run(self) -> None:
from asgiref.sync import sync_to_async
@@ -107,35 +195,63 @@ class CrawlRunner:
try:
await sync_to_async(self._prepare, thread_sensitive=True)()
_attach_bus_trace(self.bus)
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
auto_install=True,
emit_jsonl=False,
)
if self.crawl.get_system_task() == INSTALL_URL:
await self._run_install_crawl()
else:
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
await self._run_crawl_setup(root_snapshot_id)
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
await self._wait_for_snapshot_tasks()
await self._run_crawl_cleanup(root_snapshot_id)
if self.abx_services is not None:
await self.abx_services.process.wait_for_background_monitors()
live_ui = self._create_live_ui()
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(self.bus)
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
auto_install=True,
emit_jsonl=False,
)
if self.crawl.get_system_task() == INSTALL_URL:
await self._run_install_crawl()
else:
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
await self._run_crawl_setup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
await self._wait_for_snapshot_tasks()
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
await self._run_crawl_cleanup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
if self.abx_services is not None:
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
await self.abx_services.process.wait_for_background_monitors()
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
finally:
await _stop_bus_trace(self.bus)
await self.bus.stop()
if self._live_stream is not None:
try:
self._live_stream.close()
except Exception:
pass
self._live_stream = None
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
if crawl_is_finished:
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
else:
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
async def enqueue_snapshot(self, snapshot_id: str) -> None:
@@ -145,17 +261,36 @@ class CrawlRunner:
task = asyncio.create_task(self._run_snapshot(snapshot_id))
self.snapshot_tasks[snapshot_id] = task
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
    """Intentional no-op schedule_snapshot callback, used when
    process_discovered_snapshots_inline is False so discovered snapshots
    are not run inline by this runner.

    NOTE(review): the snapshot presumably stays queued for another
    worker to pick up — confirm against the orchestrator's consumer.
    """
    return None
async def _wait_for_snapshot_tasks(self) -> None:
while True:
active = [task for task in self.snapshot_tasks.values() if not task.done()]
if not active:
pending_tasks: list[asyncio.Task[None]] = []
for snapshot_id, task in list(self.snapshot_tasks.items()):
if task.done():
if self.snapshot_tasks.get(snapshot_id) is task:
self.snapshot_tasks.pop(snapshot_id, None)
task.result()
continue
pending_tasks.append(task)
if not pending_tasks:
return
await asyncio.gather(*active)
done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
for task in done:
task.result()
def _prepare(self) -> None:
from archivebox.config.configset import get_config
from archivebox.machine.models import NetworkInterface, Process
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
current_iface = NetworkInterface.current(refresh=True)
current_process = Process.current()
if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
current_process.iface = current_iface
current_process.machine = current_iface.machine
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
if self.selected_plugins is None:
@@ -168,6 +303,52 @@ class CrawlRunner:
if self.persona:
self.persona.cleanup_runtime_for_crawl(self.crawl)
def _create_live_ui(self) -> LiveBusUI | None:
    """Build the live progress UI, or return None when non-interactive.

    Prefers writing directly to /dev/tty (keeping piped stdout/stderr
    clean); otherwise writes to whichever of stderr/stdout is a TTY.
    """
    stdout_is_tty = sys.stdout.isatty()
    stderr_is_tty = sys.stderr.isatty()
    interactive_tty = stdout_is_tty or stderr_is_tty
    if not interactive_tty:
        # No TTY at all: suppress the live UI entirely.
        return None
    stream = sys.stderr if stderr_is_tty else sys.stdout
    if os.path.exists("/dev/tty"):
        try:
            # Line-buffered; kept on self so run() can close it after the crawl.
            self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
            stream = self._live_stream
        except OSError:
            self._live_stream = None
    try:
        terminal_size = os.get_terminal_size(stream.fileno())
        terminal_width = terminal_size.columns
        terminal_height = terminal_size.lines
    except (AttributeError, OSError, ValueError):
        # Streams without a usable fileno(): fall back to env/default size.
        terminal_size = shutil.get_terminal_size(fallback=(160, 40))
        terminal_width = terminal_size.columns
        terminal_height = terminal_size.lines
    ui_console = Console(
        file=stream,
        force_terminal=True,
        width=terminal_width,
        height=terminal_height,
        _environ={
            "COLUMNS": str(terminal_width),
            "LINES": str(terminal_height),
        },
    )
    plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
    live_ui = LiveBusUI(
        self.bus,
        total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
        timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
        ui_console=ui_console,
        interactive_tty=True,
    )
    live_ui.print_intro(
        url=self.primary_url or INSTALL_URL,
        output_dir=Path(self.crawl.output_dir),
        plugins_label=plugins_label,
    )
    return live_ui
def _create_root_snapshots(self) -> list[str]:
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
@@ -290,18 +471,34 @@ class CrawlRunner:
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
snapshot_bus, snapshot_services = self._create_projector_bus(
identifier=f"{self.crawl.id}_{snapshot['id']}",
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
try:
_attach_bus_trace(snapshot_bus)
_runner_debug(f"snapshot {snapshot_id} starting download()")
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=snapshot_bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
await snapshot_services.process.wait_for_background_monitors()
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
await _stop_bus_trace(snapshot_bus)
await snapshot_bus.stop()
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
@@ -322,11 +519,24 @@ class CrawlRunner:
}
def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
def run_crawl(
crawl_id: str,
*,
snapshot_ids: list[str] | None = None,
selected_plugins: list[str] | None = None,
process_discovered_snapshots_inline: bool = True,
) -> None:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=crawl_id)
asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
asyncio.run(
CrawlRunner(
crawl,
snapshot_ids=snapshot_ids,
selected_plugins=selected_plugins,
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
).run()
)
async def _run_binary(binary_id: str) -> None:
@@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
live_stream = None
try:
_attach_bus_trace(bus)
await abx_install_plugins(
plugin_names=plugin_names,
plugins=plugins,
config_overrides=config,
emit_jsonl=False,
bus=bus,
)
await abx_services.process.wait_for_background_monitors()
selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
timeout_seconds = int(config.get("TIMEOUT") or 60)
stdout_is_tty = sys.stdout.isatty()
stderr_is_tty = sys.stderr.isatty()
interactive_tty = stdout_is_tty or stderr_is_tty
ui_console = None
live_ui = None
if interactive_tty:
stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"):
try:
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
stream = live_stream
except OSError:
live_stream = None
try:
terminal_size = os.get_terminal_size(stream.fileno())
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
except (AttributeError, OSError, ValueError):
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
terminal_width = terminal_size.columns
terminal_height = terminal_size.lines
ui_console = Console(
file=stream,
force_terminal=True,
width=terminal_width,
height=terminal_height,
_environ={
"COLUMNS": str(terminal_width),
"LINES": str(terminal_height),
},
)
with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
output_dir = Path(temp_dir)
if ui_console is not None:
live_ui = LiveBusUI(
bus,
total_hooks=_count_selected_hooks(selected_plugins, None),
timeout_seconds=timeout_seconds,
ui_console=ui_console,
interactive_tty=interactive_tty,
)
live_ui.print_intro(
url=INSTALL_URL,
output_dir=output_dir,
plugins_label=plugins_label,
)
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(bus)
results = await abx_install_plugins(
plugin_names=plugin_names,
plugins=plugins,
output_dir=output_dir,
config_overrides=config,
emit_jsonl=False,
bus=bus,
)
await abx_services.process.wait_for_background_monitors()
if live_ui is not None:
live_ui.print_summary(results, output_dir=output_dir)
finally:
await _stop_bus_trace(bus)
await bus.stop()
try:
if live_stream is not None:
live_stream.close()
except Exception:
pass
def run_install(*, plugin_names: list[str] | None = None) -> None:
    """Synchronous entry point: drive the async plugin-install flow to
    completion on a fresh event loop."""
    install_coro = _run_install(plugin_names=plugin_names)
    asyncio.run(install_coro)
def recover_orphaned_crawls() -> int:
    """Recover crawls stuck in STARTED with no live worker/hook/binary process.

    A crawl counts as "active" when any RUNNING process carries its id in a
    CRAWL_ID env var. Orphaned crawls whose snapshots are all SEALED (or
    that have no snapshots) are sealed; the rest get retry_at set to now.
    Returns the number of crawls updated.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Process
    # Collect crawl ids referenced by any still-running process env.
    active_crawl_ids: set[str] = set()
    running_processes = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=[
            Process.TypeChoices.WORKER,
            Process.TypeChoices.HOOK,
            Process.TypeChoices.BINARY,
        ],
    ).only("env")
    for proc in running_processes:
        env = proc.env or {}
        if not isinstance(env, dict):
            # env is stored loosely; skip anything that isn't a dict.
            continue
        crawl_id = env.get("CRAWL_ID")
        if crawl_id:
            active_crawl_ids.add(str(crawl_id))
    recovered = 0
    now = timezone.now()
    # Orphan candidates: STARTED with no retry already scheduled.
    orphaned_crawls = Crawl.objects.filter(
        status=Crawl.StatusChoices.STARTED,
        retry_at__isnull=True,
    ).prefetch_related("snapshot_set")
    for crawl in orphaned_crawls:
        if str(crawl.id) in active_crawl_ids:
            continue
        snapshots = list(crawl.snapshot_set.all())
        if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots):
            # All work done (or none existed): seal the crawl outright.
            crawl.status = Crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=["status", "retry_at", "modified_at"])
            recovered += 1
            continue
        # Unfinished snapshots remain: schedule the crawl for retry.
        crawl.retry_at = now
        crawl.save(update_fields=["retry_at", "modified_at"])
        recovered += 1
    return recovered
def recover_orphaned_snapshots() -> int:
    """Reconcile STARTED snapshots that no longer have a live worker process.

    A snapshot is orphaned when its status is STARTED, ``retry_at`` is unset,
    and no RUNNING worker/hook/binary process advertises its id via the
    ``SNAPSHOT_ID`` env var.  Orphans whose archive results are all in a
    final state are sealed (and their crawl sealed too, once finished); the
    rest are re-queued along with their parent crawl.

    Returns:
        Number of snapshot rows updated.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import ArchiveResult, Snapshot
    from archivebox.machine.models import Process

    # Collect snapshot ids that still have a live owning process.
    live_snapshot_ids: set[str] = set()
    worker_procs = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=[
            Process.TypeChoices.WORKER,
            Process.TypeChoices.HOOK,
            Process.TypeChoices.BINARY,
        ],
    ).only("env")
    for worker in worker_procs:
        proc_env = worker.env or {}
        # env may be malformed data; only trust real dicts.
        if isinstance(proc_env, dict) and proc_env.get("SNAPSHOT_ID"):
            live_snapshot_ids.add(str(proc_env["SNAPSHOT_ID"]))

    touched = 0
    recovery_time = timezone.now()
    stalled_snapshots = (
        Snapshot.objects
        .filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
        .select_related("crawl")
        .prefetch_related("archiveresult_set")
    )
    for snapshot in stalled_snapshots:
        if str(snapshot.id) in live_snapshot_ids:
            continue  # a live process still owns this snapshot
        results = list(snapshot.archiveresult_set.all())
        all_results_final = bool(results) and all(
            result.status in ArchiveResult.FINAL_STATES for result in results
        )
        if all_results_final:
            # Every extractor finished: seal the snapshot in place.
            snapshot.status = Snapshot.StatusChoices.SEALED
            snapshot.retry_at = None
            snapshot.downloaded_at = snapshot.downloaded_at or recovery_time
            snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
            parent_crawl = snapshot.crawl
            # Cascade the seal to the crawl once all of its work is done.
            if parent_crawl.is_finished() and parent_crawl.status != Crawl.StatusChoices.SEALED:
                parent_crawl.status = Crawl.StatusChoices.SEALED
                parent_crawl.retry_at = None
                parent_crawl.save(update_fields=["status", "retry_at", "modified_at"])
        else:
            # Unfinished work: put snapshot and crawl back in the queue.
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = recovery_time
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            parent_crawl = snapshot.crawl
            parent_crawl.status = Crawl.StatusChoices.QUEUED
            parent_crawl.retry_at = recovery_time
            parent_crawl.save(update_fields=["status", "retry_at", "modified_at"])
        touched += 1
    return touched
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Snapshot
from archivebox.machine.models import Binary
while True:
@@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
.first()
)
if binary is not None:
if not binary.claim_processing_lock(lock_seconds=60):
continue
run_binary(str(binary.id))
continue
pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
queued_crawls = Crawl.objects.filter(
retry_at__lte=timezone.now(),
status=Crawl.StatusChoices.QUEUED,
)
if crawl_id:
queued_crawls = queued_crawls.filter(id=crawl_id)
queued_crawls = queued_crawls.order_by("retry_at", "created_at")
queued_crawl = queued_crawls.first()
if queued_crawl is not None:
if not queued_crawl.claim_processing_lock(lock_seconds=60):
continue
run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
continue
if crawl_id is None:
snapshot = (
Snapshot.objects.filter(retry_at__lte=timezone.now())
.exclude(status=Snapshot.StatusChoices.SEALED)
.select_related("crawl")
.order_by("retry_at", "created_at")
.first()
)
if snapshot is not None:
if not snapshot.claim_processing_lock(lock_seconds=60):
continue
run_crawl(
str(snapshot.crawl_id),
snapshot_ids=[str(snapshot.id)],
process_discovered_snapshots_inline=False,
)
continue
pending = Crawl.objects.filter(
retry_at__lte=timezone.now(),
status=Crawl.StatusChoices.STARTED,
)
if crawl_id:
pending = pending.filter(id=crawl_id)
pending = pending.order_by("retry_at", "created_at")
@@ -451,4 +874,7 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
continue
return 0
run_crawl(str(crawl.id))
if not crawl.claim_processing_lock(lock_seconds=60):
continue
run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)

View File

@@ -1,13 +1,13 @@
from __future__ import annotations
import re
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class SnapshotService(BaseService):
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
@@ -18,13 +18,17 @@ class SnapshotService(BaseService):
self.schedule_snapshot = schedule_snapshot
super().__init__(bus)
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
snapshot_id = await run_db_op(self._project_snapshot, event)
if snapshot_id:
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
if snapshot_id:
await sync_to_async(self._write_snapshot_details)(snapshot_id)
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
from archivebox.core.models import Snapshot
@@ -39,7 +43,6 @@ class SnapshotService(BaseService):
snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = None
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
snapshot.ensure_crawl_symlink()
return str(snapshot.id)
if event.depth > crawl.max_depth:
@@ -73,56 +76,36 @@ class SnapshotService(BaseService):
if snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
snapshot.ensure_crawl_symlink()
return str(snapshot.id)
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
from archivebox.config.configset import get_config
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
config = get_config(
user=getattr(crawl, "created_by", None),
crawl=crawl,
snapshot=parent_snapshot,
)
def to_pattern_list(value):
if isinstance(value, list):
return value
if isinstance(value, str):
return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
return []
allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
denylist = to_pattern_list(config.get("URL_DENYLIST", ""))
for pattern in denylist:
try:
if re.search(pattern, url):
return False
except re.error:
continue
if allowlist:
for pattern in allowlist:
try:
if re.search(pattern, url):
return True
except re.error:
continue
return False
return True
def _seal_snapshot(self, snapshot_id: str) -> None:
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
if snapshot is None:
return
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
return str(snapshot.id)
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
    """Re-create the crawl-directory symlink for the snapshot, if it exists."""
    from archivebox.core.models import Snapshot
    snapshot = (
        Snapshot.objects
        .filter(id=snapshot_id)
        .select_related("crawl", "crawl__created_by")
        .first()
    )
    if snapshot is None:
        return  # snapshot row disappeared; nothing to link
    snapshot.ensure_crawl_symlink()
def _write_snapshot_details(self, snapshot_id: str) -> None:
    """Write index + detail artifacts (jsonl/json/html) for a snapshot, if it exists."""
    from archivebox.core.models import Snapshot
    snapshot = (
        Snapshot.objects
        .filter(id=snapshot_id)
        .select_related("crawl", "crawl__created_by")
        .first()
    )
    if snapshot is not None:
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()

View File

@@ -1,16 +1,17 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import TagEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class TagService(BaseService):
LISTENS_TO = [TagEvent]
EMITS = []
async def on_TagEvent(self, event: TagEvent) -> None:
await sync_to_async(self._project, thread_sensitive=True)(event)
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
await run_db_op(self._project, event)
def _project(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, Tag