Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2026-01-07 03:16:01 +10:00
Commit: codecov, migrations, orchestrator fixes
@@ -30,6 +30,7 @@ __package__ = 'archivebox.workers'
 import os
 import time
 from typing import Type
 from datetime import timedelta
 from multiprocessing import Process as MPProcess
 
+from django.utils import timezone
@@ -67,12 +68,19 @@ class Orchestrator:
     MAX_WORKERS_PER_TYPE: int = 8   # Max workers per model type
     MAX_TOTAL_WORKERS: int = 24     # Max workers across all types
 
-    def __init__(self, exit_on_idle: bool = True):
+    def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
         self.exit_on_idle = exit_on_idle
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file = None
         self.idle_count: int = 0
+        self._last_cleanup_time: float = 0.0  # For throttling cleanup_stale_running()
+
+        # CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
+        # to keep execution strictly sequential and deterministic
+        if self.exit_on_idle:
+            self.MAX_WORKERS_PER_TYPE = 1
+            self.MAX_TOTAL_WORKERS = 1
 
     def __repr__(self) -> str:
         return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
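
For context, the constructor change above only adds state plus a foreground-mode clamp, which can be observed directly on a fresh instance. A minimal sketch using only the attributes visible in this hunk (the crawl ID value is a made-up placeholder):

    # Foreground mode (exit_on_idle=True) forces strictly sequential execution
    orchestrator = Orchestrator(exit_on_idle=True, crawl_id='01EXAMPLECRAWLID')
    assert orchestrator.MAX_WORKERS_PER_TYPE == 1
    assert orchestrator.MAX_TOTAL_WORKERS == 1

    # Background mode keeps the class-level defaults
    daemon_orchestrator = Orchestrator(exit_on_idle=False)
    assert daemon_orchestrator.MAX_WORKERS_PER_TYPE == 8
    assert daemon_orchestrator.MAX_TOTAL_WORKERS == 24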
@@ -315,15 +323,12 @@ class Orchestrator:
         # Enable progress bars only in TTY + foreground mode
         show_progress = IS_TTY and self.exit_on_idle
 
-        # Debug
-        print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
-
         self.on_startup()
         task_ids = {}
 
         if not show_progress:
             # No progress bars - just run normally
-            self._run_orchestrator_loop(None, task_ids, None, None)
+            self._run_orchestrator_loop(None, task_ids)
         else:
             # Redirect worker subprocess output to /dev/null
             devnull_fd = os.open(os.devnull, os.O_WRONLY)
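
With the DEBUG print removed, show_progress is just the conjunction of the two conditions (IS_TTY is defined elsewhere in the module), so the behavior reduces to:

    # show_progress = IS_TTY and self.exit_on_idle
    #   IS_TTY=True,  exit_on_idle=True   -> Rich progress bars (interactive foreground run)
    #   IS_TTY=False, exit_on_idle=True   -> plain output (piped / non-TTY)
    #   IS_TTY=any,   exit_on_idle=False  -> plain output (daemon / background mode)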
@@ -356,7 +361,7 @@ class Orchestrator:
                 TaskProgressColumn(),
                 console=orchestrator_console,
             ) as progress:
-                self._run_orchestrator_loop(progress, task_ids, None, None)
+                self._run_orchestrator_loop(progress, task_ids)
 
             # Restore original console
             logging_module.CONSOLE = original_console
@@ -374,7 +379,7 @@ class Orchestrator:
             pass
         # stdout_for_console is closed by orchestrator_console
 
-    def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
+    def _run_orchestrator_loop(self, progress, task_ids):
         """Run the main orchestrator loop with optional progress display."""
         try:
             while True:
@@ -385,12 +390,28 @@ class Orchestrator:
                 if progress:
                     from archivebox.core.models import Snapshot
 
-                    # Get all started snapshots
-                    active_snapshots = list(Snapshot.objects.filter(status='started'))
+                    # Get all started snapshots (optionally filtered by crawl_id)
+                    snapshot_filter = {'status': 'started'}
+                    if self.crawl_id:
+                        snapshot_filter['crawl_id'] = self.crawl_id
+                    else:
+                        # Only if processing all crawls, filter by recent modified_at to avoid stale snapshots
+                        recent_cutoff = timezone.now() - timedelta(minutes=5)
+                        snapshot_filter['modified_at__gte'] = recent_cutoff
+
+                    active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
 
                     # Track which snapshots are still active
                     active_ids = set()
 
+                    # Debug: check for duplicates
+                    snapshot_urls = [s.url for s in active_snapshots]
+                    if len(active_snapshots) != len(set(snapshot_urls)):
+                        # We have duplicate URLs - let's deduplicate by showing snapshot ID
+                        show_id = True
+                    else:
+                        show_id = False
+
                     for snapshot in active_snapshots:
                         active_ids.add(snapshot.id)
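
Because snapshot_filter is expanded with **kwargs, the two branches above are equivalent to the following ORM queries (a sketch reusing only the field names already present in this hunk):

    # Crawl-scoped orchestrator: only snapshots belonging to the one crawl
    Snapshot.objects.filter(status='started', crawl_id=self.crawl_id)

    # Global orchestrator: all started snapshots modified within the last 5 minutes
    Snapshot.objects.filter(status='started', modified_at__gte=timezone.now() - timedelta(minutes=5))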
@@ -421,7 +442,11 @@ class Orchestrator:
 
                         # Build description with URL + current plugin
                         url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
-                        description = f"{url}{current_plugin}"
+                        if show_id:
+                            # Show snapshot ID if there are duplicate URLs
+                            description = f"[{str(snapshot.id)[:8]}] {url}{current_plugin}"
+                        else:
+                            description = f"{url}{current_plugin}"
 
                         # Create or update task
                         if snapshot.id not in task_ids:
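
Illustrative task labels produced by the branch above (ID and URL are invented; the current_plugin suffix comes from code outside this hunk, and URLs over 50 characters are truncated with '...'):

    #   show_id=False: "https://example.com/page  [wget]"
    #   show_id=True:  "[01hx2k9a] https://example.com/page  [wget]"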
@@ -63,9 +63,10 @@ class Worker:
     POLL_INTERVAL: ClassVar[float] = 0.2  # How often to check for new work (seconds)
     IDLE_TIMEOUT: ClassVar[int] = 50      # Exit after N idle iterations (10 sec at 0.2 poll interval)
 
-    def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
+    def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
         self.worker_id = worker_id
         self.daemon = daemon
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file: Path | None = None
         self.idle_count: int = 0
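
The two class constants above imply the worker's idle shutdown window, as already noted in the inline comment:

    # IDLE_TIMEOUT iterations * POLL_INTERVAL seconds per iteration
    #   50 * 0.2 s = 10 s of continuous idleness before a worker exits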
@@ -346,6 +347,13 @@ class CrawlWorker(Worker):
         from archivebox.crawls.models import Crawl
         return Crawl
 
+    def get_queue(self) -> QuerySet:
+        """Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
+        qs = super().get_queue()
+        if self.crawl_id:
+            qs = qs.filter(id=self.crawl_id)
+        return qs
+
 
 class SnapshotWorker(Worker):
     """Worker for processing Snapshot objects."""
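
Assuming CrawlWorker inherits the Worker constructor shown earlier (it does not appear to override __init__ in these hunks), the new get_queue() override behaves roughly like this sketch; the contents of the base queryset are defined elsewhere and the crawl ID is a placeholder:

    CrawlWorker(crawl_id='01EXAMPLECRAWLID').get_queue()  # base queue narrowed with .filter(id='01EXAMPLECRAWLID')
    CrawlWorker().get_queue()                             # crawl_id is None, base queue returned unchanged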