mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
WIP: checkpoint working tree before rebasing onto dev
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import mimetypes
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
@@ -7,9 +8,10 @@ from pathlib import Path
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
from .process_service import ProcessService, parse_event_datetime
|
||||
|
||||
|
||||
@@ -48,22 +50,93 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
|
||||
|
||||
def _normalize_status(status: str) -> str:
|
||||
if status == "noresult":
|
||||
return "skipped"
|
||||
return "noresults"
|
||||
return status or "failed"
|
||||
|
||||
|
||||
def _has_content_files(output_files: list[str]) -> bool:
|
||||
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
|
||||
|
||||
|
||||
def _iter_archiveresult_records(stdout: str) -> list[dict]:
|
||||
records: list[dict] = []
|
||||
for raw_line in stdout.splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line.startswith("{"):
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if record.get("type") == "ArchiveResult":
|
||||
records.append(record)
|
||||
return records
|
||||
|
||||
|
||||
class ArchiveResultService(BaseService):
|
||||
LISTENS_TO = [ArchiveResultEvent]
|
||||
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
|
||||
def __init__(self, bus, *, process_service: ProcessService):
|
||||
self.process_service = process_service
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
|
||||
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
|
||||
if snapshot_output_dir is None:
|
||||
return
|
||||
plugin_dir = Path(snapshot_output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
|
||||
|
||||
def _project(self, event: ArchiveResultEvent) -> None:
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
|
||||
plugin_dir = Path(event.output_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
return
|
||||
|
||||
synthetic_record = {
|
||||
"plugin": event.plugin_name,
|
||||
"hook_name": event.hook_name,
|
||||
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
"output_str": event.stderr if event.exit_code != 0 else "",
|
||||
"error": event.stderr if event.exit_code != 0 else "",
|
||||
}
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
synthetic_record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
|
||||
return str(snapshot.output_dir) if snapshot is not None else None
|
||||
|
||||
def _project(
|
||||
self,
|
||||
event: ArchiveResultEvent,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
@@ -86,8 +159,6 @@ class ArchiveResultService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
plugin_dir = Path(snapshot.output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
result.process = process or result.process
|
||||
result.status = _normalize_status(event.status)
|
||||
result.output_str = event.output_str
|
||||
@@ -97,7 +168,28 @@ class ArchiveResultService(BaseService):
|
||||
result.output_mimetypes = output_mimetypes
|
||||
result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
|
||||
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
result.retry_at = None
|
||||
if event.error:
|
||||
result.notes = event.error
|
||||
result.save()
|
||||
|
||||
def _project_from_process_completed(
|
||||
self,
|
||||
event: ProcessCompletedEvent,
|
||||
record: dict,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
archive_result_event = ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
process_id=event.process_id,
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
)
|
||||
self._project(archive_result_event, output_files, output_size, output_mimetypes)
|
||||
|
||||
@@ -1,19 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
import asyncio
|
||||
|
||||
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class BinaryService(BaseService):
|
||||
LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
|
||||
await sync_to_async(self._project_binary, thread_sensitive=True)(event)
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
await run_db_op(self._project_binary, event)
|
||||
|
||||
async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
|
||||
await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)
|
||||
async def on_BinaryInstalledEvent__Outer(self, event: BinaryInstalledEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
await run_db_op(self._project_installed_binary, event, resolved)
|
||||
|
||||
def _project_binary(self, event: BinaryEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
@@ -44,7 +48,39 @@ class BinaryService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
"version": event.version or "",
|
||||
"sha256": event.sha256 or "",
|
||||
"binproviders": event.binproviders or "",
|
||||
"binprovider": event.binprovider or "",
|
||||
}
|
||||
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
try:
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
|
||||
spec = {
|
||||
"name": event.name,
|
||||
"binproviders": allowed_providers,
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
|
||||
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
|
||||
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
|
||||
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
|
||||
if provider_name:
|
||||
resolved["binprovider"] = str(provider_name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return resolved
|
||||
|
||||
def _project_installed_binary(self, event: BinaryInstalledEvent, resolved: dict[str, str]) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
@@ -55,10 +91,14 @@ class BinaryService(BaseService):
|
||||
"status": Binary.StatusChoices.QUEUED,
|
||||
},
|
||||
)
|
||||
binary.abspath = event.abspath or binary.abspath
|
||||
binary.version = event.version or binary.version
|
||||
binary.sha256 = event.sha256 or binary.sha256
|
||||
binary.binprovider = event.binprovider or binary.binprovider
|
||||
binary.abspath = resolved["abspath"] or binary.abspath
|
||||
binary.version = resolved["version"] or binary.version
|
||||
binary.sha256 = resolved["sha256"] or binary.sha256
|
||||
if resolved["binproviders"]:
|
||||
binary.binproviders = resolved["binproviders"]
|
||||
binary.binprovider = resolved["binprovider"] or binary.binprovider
|
||||
if event.overrides and binary.overrides != event.overrides:
|
||||
binary.overrides = event.overrides
|
||||
binary.status = Binary.StatusChoices.INSTALLED
|
||||
binary.retry_at = None
|
||||
binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
|
||||
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class CrawlService(BaseService):
|
||||
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
||||
@@ -15,17 +14,17 @@ class CrawlService(BaseService):
|
||||
self.crawl_id = crawl_id
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
|
||||
await sync_to_async(self._mark_started, thread_sensitive=True)()
|
||||
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
|
||||
await sync_to_async(self._mark_completed, thread_sensitive=True)()
|
||||
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
|
||||
await run_db_op(self._mark_completed)
|
||||
|
||||
def _mark_started(self) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
16
archivebox/services/db.py
Normal file
16
archivebox/services/db.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.db import close_old_connections
|
||||
|
||||
|
||||
def _run_db_op(func, *args, **kwargs):
|
||||
close_old_connections()
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
close_old_connections()
|
||||
|
||||
|
||||
async def run_db_op(func, *args, **kwargs):
|
||||
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
|
||||
1
archivebox/services/live_ui.py
Normal file
1
archivebox/services/live_ui.py
Normal file
@@ -0,0 +1 @@
|
||||
from abx_dl.cli import LiveBusUI
|
||||
@@ -1,16 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import MachineEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class MachineService(BaseService):
|
||||
LISTENS_TO = [MachineEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_MachineEvent(self, event: MachineEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
@@ -3,12 +3,13 @@ from __future__ import annotations
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
@@ -33,27 +34,33 @@ class ProcessService(BaseService):
|
||||
self.process_ids: dict[str, str] = {}
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
|
||||
await sync_to_async(self._project_started, thread_sensitive=True)(event)
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
|
||||
await sync_to_async(self._project_completed, thread_sensitive=True)(event)
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
await run_db_op(self._project_completed, event)
|
||||
|
||||
def get_db_process_id(self, process_id: str) -> str | None:
|
||||
return self.process_ids.get(process_id)
|
||||
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
|
||||
from archivebox.machine.models import Machine, Process
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
db_process_id = self.process_ids.get(event.process_id)
|
||||
iface = NetworkInterface.current(refresh=True)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
return process
|
||||
|
||||
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
@@ -77,12 +84,14 @@ class ProcessService(BaseService):
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
process.status = process.StatusChoices.RUNNING
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
|
||||
def _project_completed(self, event: ProcessCompletedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
if not process.cmd:
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
@@ -92,4 +101,5 @@ class ProcessService(BaseService):
|
||||
process.exit_code = event.exit_code
|
||||
process.status = process.StatusChoices.EXITED
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
|
||||
@@ -3,16 +3,21 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any
|
||||
|
||||
from django.utils import timezone
|
||||
from rich.console import Console
|
||||
|
||||
from abx_dl.events import BinaryEvent
|
||||
from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
|
||||
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
|
||||
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
@@ -21,6 +26,7 @@ from .machine_service import MachineService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
from .live_ui import LiveBusUI
|
||||
|
||||
|
||||
def _bus_name(prefix: str, identifier: str) -> str:
|
||||
@@ -35,6 +41,19 @@ def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
||||
|
||||
|
||||
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
||||
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
||||
total = 0
|
||||
for plugin in selected.values():
|
||||
total += len(list(plugin.get_crawl_hooks()))
|
||||
total += len(list(plugin.get_snapshot_hooks()))
|
||||
return total
|
||||
|
||||
|
||||
def _runner_debug(message: str) -> None:
|
||||
print(f"[runner] {message}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
@@ -78,10 +97,51 @@ async def _stop_bus_trace(bus) -> None:
|
||||
bus._archivebox_trace_task = None
|
||||
|
||||
|
||||
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||
if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
|
||||
return False
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
machine = Machine.current()
|
||||
if Process.objects.filter(
|
||||
machine=machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists():
|
||||
return False
|
||||
|
||||
log_path = CONSTANTS.LOGS_DIR / "errors.log"
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
env = os.environ.copy()
|
||||
env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))
|
||||
|
||||
with log_path.open("a", encoding="utf-8") as log_handle:
|
||||
subprocess.Popen(
|
||||
[sys.executable, "-m", "archivebox", "run", "--daemon"],
|
||||
cwd=str(CONSTANTS.DATA_DIR),
|
||||
env=env,
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=log_handle,
|
||||
stderr=log_handle,
|
||||
start_new_session=True,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
class CrawlRunner:
|
||||
MAX_CONCURRENT_SNAPSHOTS = 8
|
||||
|
||||
def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
|
||||
def __init__(
|
||||
self,
|
||||
crawl,
|
||||
*,
|
||||
snapshot_ids: list[str] | None = None,
|
||||
selected_plugins: list[str] | None = None,
|
||||
process_discovered_snapshots_inline: bool = True,
|
||||
):
|
||||
self.crawl = crawl
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
@@ -90,7 +150,12 @@ class CrawlRunner:
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
self.bus,
|
||||
crawl_id=str(crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
)
|
||||
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
|
||||
self.selected_plugins = selected_plugins
|
||||
self.initial_snapshot_ids = snapshot_ids
|
||||
@@ -100,6 +165,29 @@ class CrawlRunner:
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.primary_url = ""
|
||||
self._live_stream = None
|
||||
|
||||
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
|
||||
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
CrawlService(bus, crawl_id=str(self.crawl.id))
|
||||
SnapshotService(
|
||||
bus,
|
||||
crawl_id=str(self.crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=config_overrides,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
return bus, abx_services
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
@@ -107,35 +195,63 @@ class CrawlRunner:
|
||||
|
||||
try:
|
||||
await sync_to_async(self._prepare, thread_sensitive=True)()
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=self.base_config,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
if self.crawl.get_system_task() == INSTALL_URL:
|
||||
await self._run_install_crawl()
|
||||
else:
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
await self._wait_for_snapshot_tasks()
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
if self.abx_services is not None:
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
live_ui = self._create_live_ui()
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=self.base_config,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
if self.crawl.get_system_task() == INSTALL_URL:
|
||||
await self._run_install_crawl()
|
||||
else:
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
|
||||
await self._wait_for_snapshot_tasks()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
|
||||
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
if self.abx_services is not None:
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
|
||||
finally:
|
||||
await _stop_bus_trace(self.bus)
|
||||
await self.bus.stop()
|
||||
if self._live_stream is not None:
|
||||
try:
|
||||
self._live_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._live_stream = None
|
||||
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
|
||||
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
|
||||
if crawl_is_finished:
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
else:
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
||||
@@ -145,17 +261,36 @@ class CrawlRunner:
|
||||
task = asyncio.create_task(self._run_snapshot(snapshot_id))
|
||||
self.snapshot_tasks[snapshot_id] = task
|
||||
|
||||
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
async def _wait_for_snapshot_tasks(self) -> None:
|
||||
while True:
|
||||
active = [task for task in self.snapshot_tasks.values() if not task.done()]
|
||||
if not active:
|
||||
pending_tasks: list[asyncio.Task[None]] = []
|
||||
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
||||
if task.done():
|
||||
if self.snapshot_tasks.get(snapshot_id) is task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
task.result()
|
||||
continue
|
||||
pending_tasks.append(task)
|
||||
if not pending_tasks:
|
||||
return
|
||||
await asyncio.gather(*active)
|
||||
done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
|
||||
for task in done:
|
||||
task.result()
|
||||
|
||||
def _prepare(self) -> None:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
||||
current_iface = NetworkInterface.current(refresh=True)
|
||||
current_process = Process.current()
|
||||
if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
|
||||
current_process.iface = current_iface
|
||||
current_process.machine = current_iface.machine
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
if self.selected_plugins is None:
|
||||
@@ -168,6 +303,52 @@ class CrawlRunner:
|
||||
if self.persona:
|
||||
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
||||
|
||||
def _create_live_ui(self) -> LiveBusUI | None:
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
if not interactive_tty:
|
||||
return None
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
stream = self._live_stream
|
||||
except OSError:
|
||||
self._live_stream = None
|
||||
try:
|
||||
terminal_size = os.get_terminal_size(stream.fileno())
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
except (AttributeError, OSError, ValueError):
|
||||
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
ui_console = Console(
|
||||
file=stream,
|
||||
force_terminal=True,
|
||||
width=terminal_width,
|
||||
height=terminal_height,
|
||||
_environ={
|
||||
"COLUMNS": str(terminal_width),
|
||||
"LINES": str(terminal_height),
|
||||
},
|
||||
)
|
||||
plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
|
||||
live_ui = LiveBusUI(
|
||||
self.bus,
|
||||
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
||||
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
|
||||
ui_console=ui_console,
|
||||
interactive_tty=True,
|
||||
)
|
||||
live_ui.print_intro(
|
||||
url=self.primary_url or INSTALL_URL,
|
||||
output_dir=Path(self.crawl.output_dir),
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
return live_ui
|
||||
|
||||
def _create_root_snapshots(self) -> list[str]:
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
@@ -290,18 +471,34 @@ class CrawlRunner:
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
snapshot_bus, snapshot_services = self._create_projector_bus(
|
||||
identifier=f"{self.crawl.id}_{snapshot['id']}",
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
try:
|
||||
_attach_bus_trace(snapshot_bus)
|
||||
_runner_debug(f"snapshot {snapshot_id} starting download()")
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=snapshot_bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
|
||||
await snapshot_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
await _stop_bus_trace(snapshot_bus)
|
||||
await snapshot_bus.stop()
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -322,11 +519,24 @@ class CrawlRunner:
|
||||
}
|
||||
|
||||
|
||||
def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
|
||||
def run_crawl(
|
||||
crawl_id: str,
|
||||
*,
|
||||
snapshot_ids: list[str] | None = None,
|
||||
selected_plugins: list[str] | None = None,
|
||||
process_discovered_snapshots_inline: bool = True,
|
||||
) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
|
||||
asyncio.run(
|
||||
CrawlRunner(
|
||||
crawl,
|
||||
snapshot_ids=snapshot_ids,
|
||||
selected_plugins=selected_plugins,
|
||||
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
|
||||
).run()
|
||||
)
|
||||
|
||||
|
||||
async def _run_binary(binary_id: str) -> None:
|
||||
@@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
live_stream = None
|
||||
|
||||
try:
|
||||
_attach_bus_trace(bus)
|
||||
await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
|
||||
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
||||
timeout_seconds = int(config.get("TIMEOUT") or 60)
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
ui_console = None
|
||||
live_ui = None
|
||||
|
||||
if interactive_tty:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
stream = live_stream
|
||||
except OSError:
|
||||
live_stream = None
|
||||
try:
|
||||
terminal_size = os.get_terminal_size(stream.fileno())
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
except (AttributeError, OSError, ValueError):
|
||||
terminal_size = shutil.get_terminal_size(fallback=(160, 40))
|
||||
terminal_width = terminal_size.columns
|
||||
terminal_height = terminal_size.lines
|
||||
ui_console = Console(
|
||||
file=stream,
|
||||
force_terminal=True,
|
||||
width=terminal_width,
|
||||
height=terminal_height,
|
||||
_environ={
|
||||
"COLUMNS": str(terminal_width),
|
||||
"LINES": str(terminal_height),
|
||||
},
|
||||
)
|
||||
|
||||
with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
|
||||
output_dir = Path(temp_dir)
|
||||
if ui_console is not None:
|
||||
live_ui = LiveBusUI(
|
||||
bus,
|
||||
total_hooks=_count_selected_hooks(selected_plugins, None),
|
||||
timeout_seconds=timeout_seconds,
|
||||
ui_console=ui_console,
|
||||
interactive_tty=interactive_tty,
|
||||
)
|
||||
live_ui.print_intro(
|
||||
url=INSTALL_URL,
|
||||
output_dir=output_dir,
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(bus)
|
||||
results = await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
output_dir=output_dir,
|
||||
config_overrides=config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
if live_ui is not None:
|
||||
live_ui.print_summary(results, output_dir=output_dir)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
try:
|
||||
if live_stream is not None:
|
||||
live_stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def run_install(*, plugin_names: list[str] | None = None) -> None:
|
||||
asyncio.run(_run_install(plugin_names=plugin_names))
|
||||
|
||||
|
||||
def recover_orphaned_crawls() -> int:
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_crawl_ids: set[str] = set()
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
continue
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
if crawl_id:
|
||||
active_crawl_ids.add(str(crawl_id))
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_crawls = Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set")
|
||||
|
||||
for crawl in orphaned_crawls:
|
||||
if str(crawl.id) in active_crawl_ids:
|
||||
continue
|
||||
|
||||
snapshots = list(crawl.snapshot_set.all())
|
||||
if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots):
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
continue
|
||||
|
||||
crawl.retry_at = now
|
||||
crawl.save(update_fields=["retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
|
||||
return recovered
|
||||
|
||||
|
||||
def recover_orphaned_snapshots() -> int:
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_snapshot_ids: set[str] = set()
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
continue
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if snapshot_id:
|
||||
active_snapshot_ids.add(str(snapshot_id))
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_snapshots = (
|
||||
Snapshot.objects
|
||||
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set")
|
||||
)
|
||||
|
||||
for snapshot in orphaned_snapshots:
|
||||
if str(snapshot.id) in active_snapshot_ids:
|
||||
continue
|
||||
|
||||
results = list(snapshot.archiveresult_set.all())
|
||||
if results and all(result.status in ArchiveResult.FINAL_STATES for result in results):
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or now
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
continue
|
||||
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = now
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
crawl = snapshot.crawl
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
crawl.retry_at = now
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
recovered += 1
|
||||
|
||||
return recovered
|
||||
|
||||
|
||||
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
while True:
|
||||
@@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
|
||||
.first()
|
||||
)
|
||||
if binary is not None:
|
||||
if not binary.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_binary(str(binary.id))
|
||||
continue
|
||||
|
||||
pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
|
||||
queued_crawls = Crawl.objects.filter(
|
||||
retry_at__lte=timezone.now(),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
)
|
||||
if crawl_id:
|
||||
queued_crawls = queued_crawls.filter(id=crawl_id)
|
||||
queued_crawls = queued_crawls.order_by("retry_at", "created_at")
|
||||
|
||||
queued_crawl = queued_crawls.first()
|
||||
if queued_crawl is not None:
|
||||
if not queued_crawl.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
|
||||
continue
|
||||
|
||||
if crawl_id is None:
|
||||
snapshot = (
|
||||
Snapshot.objects.filter(retry_at__lte=timezone.now())
|
||||
.exclude(status=Snapshot.StatusChoices.SEALED)
|
||||
.select_related("crawl")
|
||||
.order_by("retry_at", "created_at")
|
||||
.first()
|
||||
)
|
||||
if snapshot is not None:
|
||||
if not snapshot.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
run_crawl(
|
||||
str(snapshot.crawl_id),
|
||||
snapshot_ids=[str(snapshot.id)],
|
||||
process_discovered_snapshots_inline=False,
|
||||
)
|
||||
continue
|
||||
|
||||
pending = Crawl.objects.filter(
|
||||
retry_at__lte=timezone.now(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
if crawl_id:
|
||||
pending = pending.filter(id=crawl_id)
|
||||
pending = pending.order_by("retry_at", "created_at")
|
||||
@@ -451,4 +874,7 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
|
||||
continue
|
||||
return 0
|
||||
|
||||
run_crawl(str(crawl.id))
|
||||
if not crawl.claim_processing_lock(lock_seconds=60):
|
||||
continue
|
||||
|
||||
run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class SnapshotService(BaseService):
|
||||
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
||||
@@ -18,13 +18,17 @@ class SnapshotService(BaseService):
|
||||
self.schedule_snapshot = schedule_snapshot
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
|
||||
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._project_snapshot, event)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
|
||||
await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)
|
||||
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._write_snapshot_details)(snapshot_id)
|
||||
|
||||
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -39,7 +43,6 @@ class SnapshotService(BaseService):
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return str(snapshot.id)
|
||||
|
||||
if event.depth > crawl.max_depth:
|
||||
@@ -73,56 +76,36 @@ class SnapshotService(BaseService):
|
||||
if snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot.ensure_crawl_symlink()
|
||||
return str(snapshot.id)
|
||||
|
||||
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
||||
from archivebox.config.configset import get_config
|
||||
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
||||
|
||||
config = get_config(
|
||||
user=getattr(crawl, "created_by", None),
|
||||
crawl=crawl,
|
||||
snapshot=parent_snapshot,
|
||||
)
|
||||
|
||||
def to_pattern_list(value):
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
|
||||
return []
|
||||
|
||||
allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
|
||||
denylist = to_pattern_list(config.get("URL_DENYLIST", ""))
|
||||
|
||||
for pattern in denylist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return False
|
||||
except re.error:
|
||||
continue
|
||||
|
||||
if allowlist:
|
||||
for pattern in allowlist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return True
|
||||
except re.error:
|
||||
continue
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _seal_snapshot(self, snapshot_id: str) -> None:
|
||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
if snapshot is None:
|
||||
return
|
||||
return None
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
|
||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is not None:
|
||||
snapshot.ensure_crawl_symlink()
|
||||
|
||||
def _write_snapshot_details(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is None:
|
||||
return
|
||||
snapshot.write_index_jsonl()
|
||||
snapshot.write_json_details()
|
||||
snapshot.write_html_details()
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class TagService(BaseService):
|
||||
LISTENS_TO = [TagEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_TagEvent(self, event: TagEvent) -> None:
|
||||
await sync_to_async(self._project, thread_sensitive=True)(event)
|
||||
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
Reference in New Issue
Block a user