mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Update CI uv handling and runner changes
This commit is contained in:
2
.github/workflows/debian.yml
vendored
2
.github/workflows/debian.yml
vendored
@@ -97,7 +97,7 @@ jobs:
|
||||
|
||||
- name: Build local wheel
|
||||
run: |
|
||||
uv sync --frozen --all-extras --no-install-project --no-install-workspace
|
||||
uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources
|
||||
uv build --wheel --out-dir /tmp/wheels/
|
||||
|
||||
- name: Download .deb artifact
|
||||
|
||||
2
.github/workflows/homebrew.yml
vendored
2
.github/workflows/homebrew.yml
vendored
@@ -55,7 +55,7 @@ jobs:
|
||||
|
||||
- name: Build local sdist
|
||||
run: |
|
||||
uv sync --frozen --all-extras --no-install-project --no-install-workspace
|
||||
uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources
|
||||
uv build --sdist --out-dir /tmp/sdist/
|
||||
|
||||
- name: Generate formula from local sdist
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -28,4 +28,4 @@ jobs:
|
||||
run: uv sync --all-extras --all-groups --no-sources --no-cache
|
||||
|
||||
- name: Run prek
|
||||
run: uv run prek run --all-files
|
||||
run: uv run --no-sync prek run --all-files
|
||||
|
||||
8
.github/workflows/pip.yml
vendored
8
.github/workflows/pip.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
version: 1.0
|
||||
|
||||
- name: UV install archivebox dev + run sub-dependencies
|
||||
run: uv sync --frozen --all-extras --no-install-project --no-install-workspace
|
||||
run: uv sync --frozen --all-extras --no-install-project --no-install-workspace --no-sources
|
||||
|
||||
- name: UV build archivebox and archivebox/pkgs/* packages
|
||||
run: |
|
||||
@@ -53,13 +53,13 @@ jobs:
|
||||
password: ${{ secrets.PYPI_PAT_SECRET }}
|
||||
|
||||
- name: UV install archivebox and archivebox/pkgs/* locally for tests
|
||||
run: uv sync --frozen --all-extras
|
||||
run: uv sync --frozen --all-extras --no-sources
|
||||
|
||||
- name: UV run archivebox init + archivebox version
|
||||
run: |
|
||||
mkdir -p data && cd data
|
||||
uv run archivebox init \
|
||||
&& uv run archivebox version
|
||||
uv run --no-sync archivebox init \
|
||||
&& uv run --no-sync archivebox version
|
||||
# && uv run archivebox add 'https://example.com' \
|
||||
# && uv run archivebox status \
|
||||
# || (echo "UV Failed to run archivebox!" && exit 1)
|
||||
|
||||
4
.github/workflows/test-parallel.yml
vendored
4
.github/workflows/test-parallel.yml
vendored
@@ -104,7 +104,7 @@ jobs:
|
||||
- name: Run test - ${{ matrix.test.name }}
|
||||
run: |
|
||||
mkdir -p tests/out
|
||||
uv run pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
uv run --no-sync pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
|
||||
plugin-tests:
|
||||
name: Plugin tests
|
||||
@@ -155,4 +155,4 @@ jobs:
|
||||
TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }}
|
||||
API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }}
|
||||
run: |
|
||||
uv run bash ./bin/test_plugins.sh --no-coverage
|
||||
uv run --no-sync bash ./bin/test_plugins.sh --no-coverage
|
||||
|
||||
6
.github/workflows/test.yml
vendored
6
.github/workflows/test.yml
vendored
@@ -67,14 +67,14 @@ jobs:
|
||||
- name: Archivebox version
|
||||
run: |
|
||||
mkdir -p tests/out/data
|
||||
DATA_DIR="$PWD/tests/out/data" uv run archivebox version
|
||||
DATA_DIR="$PWD/tests/out/data" uv run --no-sync archivebox version
|
||||
|
||||
- name: Test built package with pytest
|
||||
# TODO: remove this exception for windows once we get tests passing on that platform
|
||||
if: ${{ !contains(matrix.os, 'windows') }}
|
||||
run: |
|
||||
mkdir -p tests/out
|
||||
uv run pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
uv run --no-sync pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
|
||||
- name: Run plugin tests
|
||||
if: ${{ !contains(matrix.os, 'windows') }}
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }}
|
||||
API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }}
|
||||
run: |
|
||||
uv run bash ./bin/test_plugins.sh --no-coverage
|
||||
uv run --no-sync bash ./bin/test_plugins.sh --no-coverage
|
||||
|
||||
docker_tests:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -5,6 +5,7 @@ __package__ = "archivebox.cli"
|
||||
import sys
|
||||
import os
|
||||
import platform
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from collections.abc import Iterable
|
||||
|
||||
@@ -123,28 +124,28 @@ def version(
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
from archivebox.config.views import KNOWN_BINARIES, canonical_binary_name
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all binaries from the database with timeout protection
|
||||
all_installed = (
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("name")
|
||||
)
|
||||
requested_names = {canonical_binary_name(name) for name in binaries} if binaries else set()
|
||||
|
||||
if not all_installed.exists():
|
||||
db_binaries = {
|
||||
canonical_binary_name(binary.name): binary for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at")
|
||||
}
|
||||
all_binary_names = sorted(set(KNOWN_BINARIES) | set(db_binaries.keys()))
|
||||
|
||||
if not all_binary_names:
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
if binaries and installed.name not in binaries:
|
||||
any_available = False
|
||||
for name in all_binary_names:
|
||||
if requested_names and name not in requested_names:
|
||||
continue
|
||||
|
||||
if installed.is_valid:
|
||||
installed = db_binaries.get(name)
|
||||
if installed and installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = (installed.version or "unknown")[:15]
|
||||
provider = (installed.binprovider or "env")[:8]
|
||||
@@ -152,16 +153,51 @@ def version(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
installed.name.ljust(18),
|
||||
name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
else:
|
||||
prnt("", "[red]X[/red]", "", installed.name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
failures.append(installed.name)
|
||||
any_available = True
|
||||
continue
|
||||
|
||||
loaded = None
|
||||
try:
|
||||
abx_pkg_logger = logging.getLogger("abx_pkg")
|
||||
previous_level = abx_pkg_logger.level
|
||||
abx_pkg_logger.setLevel(logging.CRITICAL)
|
||||
try:
|
||||
loaded = load_binary({"name": name, "binproviders": "env,pip,npm,brew,apt"})
|
||||
finally:
|
||||
abx_pkg_logger.setLevel(previous_level)
|
||||
except Exception:
|
||||
loaded = None
|
||||
|
||||
if loaded and loaded.is_valid and loaded.loaded_abspath:
|
||||
display_path = str(loaded.loaded_abspath).replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = str(loaded.loaded_version or "unknown")[:15]
|
||||
provider = str(getattr(getattr(loaded, "loaded_binprovider", None), "name", "") or "env")[:8]
|
||||
prnt(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
any_available = True
|
||||
continue
|
||||
|
||||
prnt("", "[red]X[/red]", "", name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
failures.append(name)
|
||||
|
||||
if not any_available:
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists()
|
||||
|
||||
@@ -59,6 +59,9 @@ KNOWN_BINARIES = [
|
||||
CANONICAL_BINARY_ALIASES = {
|
||||
"youtube-dl": "yt-dlp",
|
||||
"ytdlp": "yt-dlp",
|
||||
"ripgrep": "rg",
|
||||
"singlefile": "single-file",
|
||||
"mercury-parser": "postlight-parser",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1689,24 +1689,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
Clean up background ArchiveResult hooks and empty results.
|
||||
|
||||
Called by the state machine when entering the 'sealed' state.
|
||||
Uses Process records to kill background hooks, then deletes empty ArchiveResults.
|
||||
Deletes empty ArchiveResults after the abx-dl cleanup phase has finished.
|
||||
"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Kill any background ArchiveResult hooks using Process records
|
||||
# Find all running hook Processes linked to this snapshot's ArchiveResults
|
||||
running_hooks = Process.objects.filter(
|
||||
archiveresult__snapshot=self,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
).distinct()
|
||||
|
||||
for process in running_hooks:
|
||||
# Use Process.kill_tree() to gracefully kill parent + children
|
||||
killed_count = process.kill_tree(graceful_timeout=2.0)
|
||||
if killed_count > 0:
|
||||
print(f"[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]")
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
if Path(self.output_dir).exists():
|
||||
for pid_file in Path(self.output_dir).glob("**/*.pid"):
|
||||
|
||||
@@ -948,19 +948,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def cleanup(self):
|
||||
"""Clean up background hooks and run on_CrawlEnd hooks."""
|
||||
from archivebox.hooks import run_hook, discover_hooks
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
running_hooks = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
env__CRAWL_ID=str(self.id),
|
||||
).distinct()
|
||||
|
||||
for process in running_hooks:
|
||||
# Use Process.kill_tree() to gracefully kill parent + children
|
||||
killed_count = process.kill_tree(graceful_timeout=2.0)
|
||||
if killed_count > 0:
|
||||
print(f"[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]")
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
if self.output_dir.exists():
|
||||
|
||||
@@ -725,19 +725,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
since installations are foreground, but included for consistency).
|
||||
"""
|
||||
|
||||
# Kill any background binary installation hooks using Process records
|
||||
# (rarely used since binary installations are typically foreground)
|
||||
running_hooks = Process.objects.filter(
|
||||
binary=self,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
)
|
||||
|
||||
for process in running_hooks:
|
||||
killed_count = process.kill_tree(graceful_timeout=2.0)
|
||||
if killed_count > 0:
|
||||
print(f"[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]")
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
output_dir = self.output_dir
|
||||
if output_dir.exists():
|
||||
|
||||
@@ -1,51 +1,3 @@
|
||||
from __future__ import annotations
|
||||
from abx_dl.cli import LiveBusUI
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
|
||||
class LiveBusUI:
|
||||
"""Small tty-only runner UI.
|
||||
|
||||
The runner only needs a context manager and a couple of print helpers here.
|
||||
Keeping this minimal avoids a hard dependency on a heavier live dashboard.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bus: Any,
|
||||
*,
|
||||
total_hooks: int,
|
||||
timeout_seconds: int,
|
||||
ui_console: Console,
|
||||
interactive_tty: bool,
|
||||
) -> None:
|
||||
self.bus = bus
|
||||
self.total_hooks = total_hooks
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.ui_console = ui_console
|
||||
self.interactive_tty = interactive_tty
|
||||
|
||||
def __enter__(self) -> LiveBusUI:
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb) -> bool:
|
||||
return False
|
||||
|
||||
def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None:
|
||||
if not self.interactive_tty:
|
||||
return
|
||||
self.ui_console.print(
|
||||
f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] "
|
||||
f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)",
|
||||
)
|
||||
|
||||
def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None:
|
||||
if not self.interactive_tty:
|
||||
return
|
||||
total_results = len(results or [])
|
||||
self.ui_console.print(
|
||||
f"[green]Completed[/green] {total_results} result(s) in [dim]{output_dir}[/dim]",
|
||||
)
|
||||
__all__ = ["LiveBusUI"]
|
||||
|
||||
@@ -58,10 +58,6 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
|
||||
)
|
||||
|
||||
|
||||
def _runner_debug(message: str) -> None:
|
||||
print(f"[runner] {message}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _binary_env_key(name: str) -> str:
|
||||
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
|
||||
return f"{normalized}_BINARY"
|
||||
@@ -302,24 +298,13 @@ class CrawlRunner:
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
_runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
|
||||
await self._wait_for_snapshot_tasks()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
|
||||
await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
|
||||
_runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
_runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
|
||||
if self.abx_services is not None:
|
||||
_runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
|
||||
finally:
|
||||
await _stop_bus_trace(self.bus)
|
||||
await self.bus.stop()
|
||||
@@ -551,7 +536,6 @@ class CrawlRunner:
|
||||
)
|
||||
try:
|
||||
_attach_bus_trace(snapshot_bus)
|
||||
_runner_debug(f"snapshot {snapshot_id} starting download()")
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
@@ -564,9 +548,7 @@ class CrawlRunner:
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
_runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
|
||||
await snapshot_services.process.wait_for_background_monitors()
|
||||
_runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
|
||||
@@ -78,11 +78,7 @@ class Command(BaseCommand):
|
||||
running = Process.objects.filter(
|
||||
machine=machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.ORCHESTRATOR,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
)
|
||||
for proc in running:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user