Update CI uv handling and runner changes

This commit is contained in:
Nick Sweeting
2026-03-23 13:27:23 -07:00
parent e1eb5693c9
commit 39450111dd
14 changed files with 73 additions and 146 deletions

View File

@@ -5,6 +5,7 @@ __package__ = "archivebox.cli"
import sys
import os
import platform
import logging
from pathlib import Path
from collections.abc import Iterable
@@ -123,28 +124,28 @@ def version(
setup_django()
from archivebox.machine.models import Machine, Binary
from archivebox.config.views import KNOWN_BINARIES, canonical_binary_name
from abx_dl.dependencies import load_binary
machine = Machine.current()
# Get all binaries from the database with timeout protection
all_installed = (
Binary.objects.filter(
machine=machine,
)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("name")
)
requested_names = {canonical_binary_name(name) for name in binaries} if binaries else set()
if not all_installed.exists():
db_binaries = {
canonical_binary_name(binary.name): binary for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at")
}
all_binary_names = sorted(set(KNOWN_BINARIES) | set(db_binaries.keys()))
if not all_binary_names:
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
any_available = False
for name in all_binary_names:
if requested_names and name not in requested_names:
continue
if installed.is_valid:
installed = db_binaries.get(name)
if installed and installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
version_str = (installed.version or "unknown")[:15]
provider = (installed.binprovider or "env")[:8]
@@ -152,16 +153,51 @@ def version(
"",
"[green]√[/green]",
"",
installed.name.ljust(18),
name.ljust(18),
version_str.ljust(16),
provider.ljust(8),
display_path,
overflow="ignore",
crop=False,
)
else:
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
failures.append(installed.name)
any_available = True
continue
loaded = None
try:
abx_pkg_logger = logging.getLogger("abx_pkg")
previous_level = abx_pkg_logger.level
abx_pkg_logger.setLevel(logging.CRITICAL)
try:
loaded = load_binary({"name": name, "binproviders": "env,pip,npm,brew,apt"})
finally:
abx_pkg_logger.setLevel(previous_level)
except Exception:
loaded = None
if loaded and loaded.is_valid and loaded.loaded_abspath:
display_path = str(loaded.loaded_abspath).replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
version_str = str(loaded.loaded_version or "unknown")[:15]
provider = str(getattr(getattr(loaded, "loaded_binprovider", None), "name", "") or "env")[:8]
prnt(
"",
"[green]√[/green]",
"",
name.ljust(18),
version_str.ljust(16),
provider.ljust(8),
display_path,
overflow="ignore",
crop=False,
)
any_available = True
continue
prnt("", "[red]X[/red]", "", name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
failures.append(name)
if not any_available:
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()

View File

@@ -59,6 +59,9 @@ KNOWN_BINARIES = [
# Map alternate/legacy spellings of binary names to the canonical name
# used for lookups elsewhere (presumably via canonical_binary_name() — confirm).
CANONICAL_BINARY_ALIASES = dict(
    [
        ("youtube-dl", "yt-dlp"),
        ("ytdlp", "yt-dlp"),
        ("ripgrep", "rg"),
        ("singlefile", "single-file"),
        ("mercury-parser", "postlight-parser"),
    ]
)

View File

@@ -1689,24 +1689,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Clean up background ArchiveResult hooks and empty results.
Called by the state machine when entering the 'sealed' state.
Uses Process records to kill background hooks, then deletes empty ArchiveResults.
Deletes empty ArchiveResults after the abx-dl cleanup phase has finished.
"""
from archivebox.machine.models import Process
# Kill any background ArchiveResult hooks using Process records
# Find all running hook Processes linked to this snapshot's ArchiveResults
running_hooks = Process.objects.filter(
archiveresult__snapshot=self,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
).distinct()
for process in running_hooks:
# Use Process.kill_tree() to gracefully kill parent + children
killed_count = process.kill_tree(graceful_timeout=2.0)
if killed_count > 0:
print(f"[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]")
# Clean up .pid files from output directory
if Path(self.output_dir).exists():
for pid_file in Path(self.output_dir).glob("**/*.pid"):

View File

@@ -948,19 +948,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def cleanup(self):
"""Clean up background hooks and run on_CrawlEnd hooks."""
from archivebox.hooks import run_hook, discover_hooks
from archivebox.machine.models import Process
running_hooks = Process.objects.filter(
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
env__CRAWL_ID=str(self.id),
).distinct()
for process in running_hooks:
# Use Process.kill_tree() to gracefully kill parent + children
killed_count = process.kill_tree(graceful_timeout=2.0)
if killed_count > 0:
print(f"[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]")
# Clean up .pid files from output directory
if self.output_dir.exists():

View File

@@ -725,19 +725,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
since installations are foreground, but included for consistency).
"""
# Kill any background binary installation hooks using Process records
# (rarely used since binary installations are typically foreground)
running_hooks = Process.objects.filter(
binary=self,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
)
for process in running_hooks:
killed_count = process.kill_tree(graceful_timeout=2.0)
if killed_count > 0:
print(f"[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]")
# Clean up .pid files from output directory
output_dir = self.output_dir
if output_dir.exists():

View File

@@ -1,51 +1,3 @@
from __future__ import annotations
from abx_dl.cli import LiveBusUI
from pathlib import Path
from typing import Any
from rich.console import Console
class LiveBusUI:
    """Minimal tty-only UI for the runner.

    Acts as a no-op context manager and offers two one-line print helpers.
    All output is suppressed when not attached to an interactive tty, which
    keeps the runner free of any heavier live-dashboard dependency.
    """

    def __init__(
        self,
        bus: Any,
        *,
        total_hooks: int,
        timeout_seconds: int,
        ui_console: Console,
        interactive_tty: bool,
    ) -> None:
        # Just record the configuration; nothing happens until a print helper runs.
        self.bus = bus
        self.total_hooks = total_hooks
        self.timeout_seconds = timeout_seconds
        self.ui_console = ui_console
        self.interactive_tty = interactive_tty

    def __enter__(self) -> LiveBusUI:
        # No setup required; provided so callers can use `with`.
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Returning False never swallows exceptions raised inside the block.
        return False

    def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None:
        """Print a one-line run banner (interactive tty only)."""
        if self.interactive_tty:
            self.ui_console.print(
                f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] "
                f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)",
            )

    def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None:
        """Print a one-line completion summary (interactive tty only)."""
        if not self.interactive_tty:
            return
        result_count = len(results) if results else 0
        self.ui_console.print(
            f"[green]Completed[/green] {result_count} result(s) in [dim]{output_dir}[/dim]",
        )
__all__ = ["LiveBusUI"]

View File

@@ -58,10 +58,6 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
)
def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
@@ -302,24 +298,13 @@ class CrawlRunner:
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
await self._run_crawl_setup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
await self._wait_for_snapshot_tasks()
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
await self._run_crawl_cleanup(root_snapshot_id)
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
if self.abx_services is not None:
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
await self.abx_services.process.wait_for_background_monitors()
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
finally:
await _stop_bus_trace(self.bus)
await self.bus.stop()
@@ -551,7 +536,6 @@ class CrawlRunner:
)
try:
_attach_bus_trace(snapshot_bus)
_runner_debug(f"snapshot {snapshot_id} starting download()")
await download(
url=snapshot["url"],
plugins=self.plugins,
@@ -564,9 +548,7 @@ class CrawlRunner:
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
await snapshot_services.process.wait_for_background_monitors()
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:

View File

@@ -78,11 +78,7 @@ class Command(BaseCommand):
running = Process.objects.filter(
machine=machine,
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.ORCHESTRATOR,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
process_type=Process.TypeChoices.ORCHESTRATOR,
)
for proc in running:
try: