diff --git a/.github/workflows/debian.yml b/.github/workflows/debian.yml index 1ffbffad..2efc6324 100644 --- a/.github/workflows/debian.yml +++ b/.github/workflows/debian.yml @@ -97,7 +97,7 @@ jobs: - name: Build local wheel run: | - uv sync --frozen --all-extras --no-install-project --no-install-workspace + uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources uv build --wheel --out-dir /tmp/wheels/ - name: Download .deb artifact diff --git a/.github/workflows/homebrew.yml b/.github/workflows/homebrew.yml index 70ce8ded..eb73d94d 100644 --- a/.github/workflows/homebrew.yml +++ b/.github/workflows/homebrew.yml @@ -55,7 +55,7 @@ jobs: - name: Build local sdist run: | - uv sync --frozen --all-extras --no-install-project --no-install-workspace + uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources uv build --sdist --out-dir /tmp/sdist/ - name: Generate formula from local sdist diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 272b8869..f851489e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -28,4 +28,4 @@ jobs: run: uv sync --all-extras --all-groups --no-sources --no-cache - name: Run prek - run: uv run prek run --all-files + run: uv run --no-sync prek run --all-files diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml index 709c95c3..c6c06c34 100755 --- a/.github/workflows/pip.yml +++ b/.github/workflows/pip.yml @@ -39,7 +39,7 @@ jobs: version: 1.0 - name: UV install archivebox dev + run sub-dependencies - run: uv sync --frozen --all-extras --no-install-project --no-install-workspace + run: uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources - name: UV build archivebox and archivebox/pkgs/* packages run: | @@ -53,13 +53,13 @@ jobs: password: ${{ secrets.PYPI_PAT_SECRET }} - name: UV install archivebox and archivebox/pkgs/* locally for tests - run: uv sync --frozen --all-extras + run: uv sync --frozen --all-extras --no-sources - name: UV run archivebox init + archivebox version run: | mkdir -p data && cd data - uv run archivebox init \ - && uv run archivebox version + uv run --no-sync archivebox init \ + && uv run --no-sync archivebox version # && uv run archivebox add 'https://example.com' \ # && uv run archivebox status \ # || (echo "UV Failed to run archivebox!" && exit 1) diff --git a/.github/workflows/test-parallel.yml b/.github/workflows/test-parallel.yml index e93e9c04..0d66faa9 100644 --- a/.github/workflows/test-parallel.yml +++ b/.github/workflows/test-parallel.yml @@ -104,7 +104,7 @@ jobs: - name: Run test - ${{ matrix.test.name }} run: | mkdir -p tests/out - uv run pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs + uv run --no-sync pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs plugin-tests: name: Plugin tests @@ -155,4 +155,4 @@ jobs: TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} run: | - uv run bash ./bin/test_plugins.sh --no-coverage + uv run --no-sync bash ./bin/test_plugins.sh --no-coverage diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2d06ae91..83df0e32 100755 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -67,14 +67,14 @@ jobs: - name: Archivebox version run: | mkdir -p tests/out/data - DATA_DIR="$PWD/tests/out/data" uv run archivebox version + DATA_DIR="$PWD/tests/out/data" uv run --no-sync archivebox version - name: Test built package with pytest # TODO: remove this exception for windows once we get tests passing on that platform if: ${{ !contains(matrix.os, 'windows') }} run: | mkdir -p tests/out - uv run pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs + uv run --no-sync pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs - name: Run plugin tests if: ${{ !contains(matrix.os, 'windows') }} @@ -83,7 +83,7 @@ jobs: TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} run: | - uv run bash ./bin/test_plugins.sh --no-coverage + uv run --no-sync bash ./bin/test_plugins.sh --no-coverage docker_tests: runs-on: ubuntu-latest diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 7d293c5a..a9ec19d6 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -5,6 +5,7 @@ __package__ = "archivebox.cli" import sys import os import platform +import logging from pathlib import Path from collections.abc import Iterable @@ -123,28 +124,28 @@ def version( setup_django() from archivebox.machine.models import Machine, Binary + from archivebox.config.views import KNOWN_BINARIES, canonical_binary_name + from abx_dl.dependencies import load_binary machine = Machine.current() - # Get all binaries from the database with timeout protection - all_installed = ( - Binary.objects.filter( - machine=machine, - ) - .exclude(abspath="") - .exclude(abspath__isnull=True) - .order_by("name") - ) + requested_names = {canonical_binary_name(name) for name in binaries} if binaries else set() - if not all_installed.exists(): + db_binaries = { + canonical_binary_name(binary.name): binary for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at") + } + all_binary_names = sorted(set(KNOWN_BINARIES) | set(db_binaries.keys())) + + if not all_binary_names: prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]") else: - for installed in all_installed: - # Skip if user specified specific binaries and this isn't one - if binaries and installed.name not in binaries: + any_available = False + for name in all_binary_names: + if requested_names and name not in requested_names: continue - if installed.is_valid: + installed = db_binaries.get(name) + if installed and installed.is_valid: display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~") version_str = (installed.version or "unknown")[:15] provider = (installed.binprovider or "env")[:8] @@ -152,16 +153,51 @@ def version( "", "[green]√[/green]", "", - installed.name.ljust(18), + name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow="ignore", crop=False, ) - else: - prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False) - failures.append(installed.name) + any_available = True + continue + + loaded = None + try: + abx_pkg_logger = logging.getLogger("abx_pkg") + previous_level = abx_pkg_logger.level + abx_pkg_logger.setLevel(logging.CRITICAL) + try: + loaded = load_binary({"name": name, "binproviders": "env,pip,npm,brew,apt"}) + finally: + abx_pkg_logger.setLevel(previous_level) + except Exception: + loaded = None + + if loaded and loaded.is_valid and loaded.loaded_abspath: + display_path = str(loaded.loaded_abspath).replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~") + version_str = str(loaded.loaded_version or "unknown")[:15] + provider = str(getattr(getattr(loaded, "loaded_binprovider", None), "name", "") or "env")[:8] + prnt( + "", + "[green]√[/green]", + "", + name.ljust(18), + version_str.ljust(16), + provider.ljust(8), + display_path, + overflow="ignore", + crop=False, + ) + any_available = True + continue + + prnt("", "[red]X[/red]", "", name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False) + failures.append(name) + + if not any_available: + prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]") # Show hint if no binaries are installed yet has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists() diff --git a/archivebox/config/views.py b/archivebox/config/views.py index a6f821c8..7c2f99dd 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -59,6 +59,9 @@ KNOWN_BINARIES = [ CANONICAL_BINARY_ALIASES = { "youtube-dl": "yt-dlp", "ytdlp": "yt-dlp", + "ripgrep": "rg", + "singlefile": "single-file", + "mercury-parser": "postlight-parser", } diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 83340d82..4256584f 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1689,24 +1689,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Clean up background ArchiveResult hooks and empty results. Called by the state machine when entering the 'sealed' state. - Uses Process records to kill background hooks, then deletes empty ArchiveResults. + Deletes empty ArchiveResults after the abx-dl cleanup phase has finished. """ - from archivebox.machine.models import Process - - # Kill any background ArchiveResult hooks using Process records - # Find all running hook Processes linked to this snapshot's ArchiveResults - running_hooks = Process.objects.filter( - archiveresult__snapshot=self, - process_type=Process.TypeChoices.HOOK, - status=Process.StatusChoices.RUNNING, - ).distinct() - - for process in running_hooks: - # Use Process.kill_tree() to gracefully kill parent + children - killed_count = process.kill_tree(graceful_timeout=2.0) - if killed_count > 0: - print(f"[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]") - # Clean up .pid files from output directory if Path(self.output_dir).exists(): for pid_file in Path(self.output_dir).glob("**/*.pid"): diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 08fbaeca..d3487b89 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -948,19 +948,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def cleanup(self): """Clean up background hooks and run on_CrawlEnd hooks.""" from archivebox.hooks import run_hook, discover_hooks - from archivebox.machine.models import Process - - running_hooks = Process.objects.filter( - process_type=Process.TypeChoices.HOOK, - status=Process.StatusChoices.RUNNING, - env__CRAWL_ID=str(self.id), - ).distinct() - - for process in running_hooks: - # Use Process.kill_tree() to gracefully kill parent + children - killed_count = process.kill_tree(graceful_timeout=2.0) - if killed_count > 0: - print(f"[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]") # Clean up .pid files from output directory if self.output_dir.exists(): diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 65756255..1d8f75f6 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -725,19 +725,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine): since installations are foreground, but included for consistency). """ - # Kill any background binary installation hooks using Process records - # (rarely used since binary installations are typically foreground) - running_hooks = Process.objects.filter( - binary=self, - process_type=Process.TypeChoices.HOOK, - status=Process.StatusChoices.RUNNING, - ) - - for process in running_hooks: - killed_count = process.kill_tree(graceful_timeout=2.0) - if killed_count > 0: - print(f"[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]") - # Clean up .pid files from output directory output_dir = self.output_dir if output_dir.exists(): diff --git a/archivebox/services/live_ui.py b/archivebox/services/live_ui.py index 830cbb1b..a89f016c 100644 --- a/archivebox/services/live_ui.py +++ b/archivebox/services/live_ui.py @@ -1,51 +1,3 @@ -from __future__ import annotations +from abx_dl.cli import LiveBusUI -from pathlib import Path -from typing import Any - -from rich.console import Console - - -class LiveBusUI: - """Small tty-only runner UI. - - The runner only needs a context manager and a couple of print helpers here. - Keeping this minimal avoids a hard dependency on a heavier live dashboard. - """ - - def __init__( - self, - bus: Any, - *, - total_hooks: int, - timeout_seconds: int, - ui_console: Console, - interactive_tty: bool, - ) -> None: - self.bus = bus - self.total_hooks = total_hooks - self.timeout_seconds = timeout_seconds - self.ui_console = ui_console - self.interactive_tty = interactive_tty - - def __enter__(self) -> LiveBusUI: - return self - - def __exit__(self, exc_type, exc, tb) -> bool: - return False - - def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None: - if not self.interactive_tty: - return - self.ui_console.print( - f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] " - f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)", - ) - - def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None: - if not self.interactive_tty: - return - total_results = len(results or []) - self.ui_console.print( - f"[green]Completed[/green] {total_results} result(s) in [dim]{output_dir}[/dim]", - ) +__all__ = ["LiveBusUI"] diff --git a/archivebox/services/runner.py b/archivebox/services/runner.py index 3f86a214..b1d0619b 100644 --- a/archivebox/services/runner.py +++ b/archivebox/services/runner.py @@ -58,10 +58,6 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str ) -def _runner_debug(message: str) -> None: - print(f"[runner] {message}", file=sys.stderr, flush=True) - - def _binary_env_key(name: str) -> str: normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper() return f"{normalized}_BINARY" @@ -302,24 +298,13 @@ class CrawlRunner: snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)() if snapshot_ids: root_snapshot_id = snapshot_ids[0] - _runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}") await self._run_crawl_setup(root_snapshot_id) - _runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}") for snapshot_id in snapshot_ids: await self.enqueue_snapshot(snapshot_id) - _runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}") await self._wait_for_snapshot_tasks() - _runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks") - _runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()") - await sync_to_async(self.crawl.cleanup, thread_sensitive=True)() - _runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()") - _runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}") await self._run_crawl_cleanup(root_snapshot_id) - _runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}") if self.abx_services is not None: - _runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors") await self.abx_services.process.wait_for_background_monitors() - _runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors") finally: await _stop_bus_trace(self.bus) await self.bus.stop() @@ -551,7 +536,6 @@ class CrawlRunner: ) try: _attach_bus_trace(snapshot_bus) - _runner_debug(f"snapshot {snapshot_id} starting download()") await download( url=snapshot["url"], plugins=self.plugins, @@ -564,9 +548,7 @@ class CrawlRunner: skip_crawl_setup=True, skip_crawl_cleanup=True, ) - _runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors") await snapshot_services.process.wait_for_background_monitors() - _runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors") finally: current_task = asyncio.current_task() if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task: diff --git a/archivebox/workers/management/commands/runner_watch.py b/archivebox/workers/management/commands/runner_watch.py index 7c1329ec..9cb82152 100644 --- a/archivebox/workers/management/commands/runner_watch.py +++ b/archivebox/workers/management/commands/runner_watch.py @@ -78,11 +78,7 @@ class Command(BaseCommand): running = Process.objects.filter( machine=machine, status=Process.StatusChoices.RUNNING, - process_type__in=[ - Process.TypeChoices.ORCHESTRATOR, - Process.TypeChoices.HOOK, - Process.TypeChoices.BINARY, - ], + process_type=Process.TypeChoices.ORCHESTRATOR, ) for proc in running: try: