Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-03 09:25:42 +10:00)

Commit: codecov, migrations, orchestrator fixes
@@ -1515,6 +1515,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         parent_snapshot = overrides.get('snapshot')  # Parent snapshot
         created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
 
+        # DEBUG: Check if crawl_id in record matches overrides crawl
+        import sys
+        record_crawl_id = record.get('crawl_id')
+        if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id):
+            print(f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", file=sys.stderr)
+
         # If no crawl provided, inherit from parent or auto-create one
         if not crawl:
             if parent_snapshot:
@@ -1536,6 +1542,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                     label=f'auto-created for {url[:50]}',
                     created_by_id=created_by_id,
                 )
+                print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
 
         # Parse tags
         tags_str = record.get('tags', '')
@@ -1546,8 +1553,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             if tag.strip()
         ))
 
-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
-        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
+        # Check for existing snapshot with same URL in same crawl
+        # (URLs can exist in multiple crawls, but should be unique within a crawl)
+        snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
 
         title = record.get('title')
         timestamp = record.get('timestamp')
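The last hunk above changes the dedup rule in Snapshot.from_json from "one Snapshot per URL" to "one Snapshot per (url, crawl) pair". A minimal sketch of that rule in isolation (not the actual from_json body, which also applies tags, title, timestamp, etc.):

    from archivebox.core.models import Snapshot

    def get_or_make_snapshot(url: str, crawl):
        # A URL may appear in many crawls, but should be unique within a single crawl.
        existing = (
            Snapshot.objects
            .filter(url=url, crawl=crawl)
            .order_by('-created_at')
            .first()
        )
        return existing or Snapshot.objects.create(url=url, crawl=crawl)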
@@ -892,15 +892,34 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     """
     plugin_upper = plugin_name.upper()
 
-    # 1. Enabled: PLUGINNAME_ENABLED (default True)
+    # 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
     # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
-    enabled_key = f'{plugin_upper}_ENABLED'
-    enabled = config.get(enabled_key)
-    if enabled is None:
-        enabled = True
-    elif isinstance(enabled, str):
-        # Handle string values from config file ("true"/"false")
-        enabled = enabled.lower() not in ('false', '0', 'no', '')
+    # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
+    plugins_whitelist = config.get('PLUGINS', '')
+    if plugins_whitelist:
+        # PLUGINS whitelist is specified - only enable plugins in the list
+        plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
+        if plugin_name.lower() not in plugin_names:
+            # Plugin not in whitelist - explicitly disabled
+            enabled = False
+        else:
+            # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
+            enabled_key = f'{plugin_upper}_ENABLED'
+            enabled = config.get(enabled_key)
+            if enabled is None:
+                enabled = True  # Default to enabled if in whitelist
+            elif isinstance(enabled, str):
+                enabled = enabled.lower() not in ('false', '0', 'no', '')
+    else:
+        # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
+        enabled_key = f'{plugin_upper}_ENABLED'
+        enabled = config.get(enabled_key)
+        if enabled is None:
+            enabled = True
+        elif isinstance(enabled, str):
+            # Handle string values from config file ("true"/"false")
+            enabled = enabled.lower() not in ('false', '0', 'no', '')
 
     # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
     timeout_key = f'{plugin_upper}_TIMEOUT'
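Both branches of the new check parse PLUGINNAME_ENABLED the same way, so the precedence it implements is: the PLUGINS whitelist (if set) gates the plugin, then PLUGINNAME_ENABLED can still disable it, otherwise the plugin defaults to enabled. A standalone sketch of just that precedence (simplified; the real helper also resolves timeouts, binaries, etc.):

    def is_plugin_enabled(plugin_name: str, config: dict) -> bool:
        whitelist = [p.strip().lower() for p in config.get('PLUGINS', '').split(',') if p.strip()]
        if whitelist and plugin_name.lower() not in whitelist:
            return False  # not in the whitelist -> disabled
        enabled = config.get(f'{plugin_name.upper()}_ENABLED')
        if enabled is None:
            return True  # default: enabled
        if isinstance(enabled, str):
            return enabled.lower() not in ('false', '0', 'no', '')
        return bool(enabled)

    assert is_plugin_enabled('wget', {'PLUGINS': 'wget,favicon'})
    assert not is_plugin_enabled('pdf', {'PLUGINS': 'wget,favicon'})
    assert not is_plugin_enabled('wget', {'PLUGINS': 'wget,favicon', 'WGET_ENABLED': 'false'})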
@@ -80,7 +80,8 @@ class TestAccessibilityWithChrome(TestCase):
         # Run accessibility hook with the active Chrome session
         result = subprocess.run(
             ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
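The remaining test hunks in this commit apply the same one-line idea: run every hook subprocess with the test environment from get_test_env(). Note that the new form above, cwd=str(snapshot_chrome_dir, env=get_test_env()), hands env to str() rather than to subprocess.run() and would raise a TypeError at runtime; the screenshot, title, and headers hunks below pass it directly to subprocess.run(), which suggests the intended call shape is roughly this sketch (get_test_env() is assumed to return a plain dict of environment variables):

    import subprocess

    result = subprocess.run(
        ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
        cwd=str(snapshot_chrome_dir),   # cwd stays a plain path string
        env=get_test_env(),             # env belongs to subprocess.run(), not str()
        capture_output=True,
        text=True,
        timeout=60,
    )

In the extension/puppeteer tests further down, the same calls already pass env=env (or env=env_with_ext), so adding a second env keyword is not an option there; those call sites would need the two dicts merged instead.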
@@ -208,7 +208,8 @@ def test_chrome_launch_and_tab_creation():
     env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
     result = subprocess.run(
         ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
-        cwd=str(snapshot_chrome_dir),
+        cwd=str(snapshot_chrome_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=60,
@@ -268,7 +269,8 @@ def test_chrome_navigation():
 
     result = subprocess.run(
         ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
-        cwd=str(snapshot_chrome_dir),
+        cwd=str(snapshot_chrome_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=60,
@@ -279,7 +281,8 @@ def test_chrome_navigation():
     # Navigate to URL
     result = subprocess.run(
         ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
-        cwd=str(snapshot_chrome_dir),
+        cwd=str(snapshot_chrome_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=120,
@@ -414,7 +417,8 @@ def test_multiple_snapshots_share_chrome():
         # Create tab for this snapshot
         result = subprocess.run(
             ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
@@ -80,7 +80,8 @@ class TestConsolelogWithChrome(TestCase):
         # Run consolelog hook with the active Chrome session
         result = subprocess.run(
             ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
@@ -68,7 +68,8 @@ def test_extracts_dom_from_example_com():
         capture_output=True,
         text=True,
-        timeout=120
-    )
+        timeout=120,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -152,7 +153,8 @@ def test_staticfile_present_skips():
         capture_output=True,
         text=True,
-        timeout=30
-    )
+        timeout=30,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, "Should exit 0 when permanently skipping"
@@ -50,7 +50,8 @@ def test_node_is_available():
         capture_output=True,
         text=True,
-        timeout=10
-    )
+        timeout=10,
+        env=get_test_env(),
+    )
     assert result.returncode == 0, f"node not executable: {result.stderr}"
     assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
@@ -72,7 +73,8 @@ def test_extracts_headers_from_example_com():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -133,7 +135,8 @@ def test_headers_output_structure():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -192,7 +195,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -309,7 +313,8 @@ def test_handles_https_urls():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     if result.returncode == 0:
         output_headers_file = tmpdir / 'headers.json'
@@ -334,7 +339,8 @@ def test_handles_404_gracefully():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # May succeed or fail depending on server behavior
     # If it succeeds, verify 404 status is captured
@@ -123,7 +123,8 @@ def test_scrolls_page_and_outputs_stats():
 
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
-        cwd=str(infiniscroll_dir),
+        cwd=str(infiniscroll_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=60,
@@ -187,7 +188,8 @@ def test_config_scroll_limit_honored():
 
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
-        cwd=str(infiniscroll_dir),
+        cwd=str(infiniscroll_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=60,
@@ -246,7 +248,8 @@ def test_config_timeout_honored():
     start_time = time.time()
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
-        cwd=str(infiniscroll_dir),
+        cwd=str(infiniscroll_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         timeout=30,
@@ -154,7 +154,8 @@ def test_extension_loads_in_chromium():
     # Step 1: Install the extension
     result = subprocess.run(
         ['node', str(INSTALL_SCRIPT)],
-        cwd=str(tmpdir),
+        cwd=str(tmpdir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env,
@@ -291,7 +292,8 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
-        cwd=str(tmpdir),
+        cwd=str(tmpdir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env,
@@ -443,7 +445,8 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
-        cwd=str(script_dir),
+        cwd=str(script_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env,
@@ -557,7 +560,8 @@ def test_hides_cookie_consent_on_filmin():
 
     result = subprocess.run(
         ['node', str(INSTALL_SCRIPT)],
-        cwd=str(tmpdir),
+        cwd=str(tmpdir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env_with_ext,
@@ -80,7 +80,8 @@ class TestParseDomOutlinksWithChrome(TestCase):
         # Run outlinks hook with the active Chrome session
         result = subprocess.run(
             ['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
@@ -69,7 +69,8 @@ def test_extracts_pdf_from_example_com():
         capture_output=True,
         text=True,
-        timeout=120
-    )
+        timeout=120,
+        env=get_test_env(),
+    )
 
     # Parse clean JSONL output (hook might fail due to network issues)
     result_json = None
@@ -81,7 +81,8 @@ class TestRedirectsWithChrome(TestCase):
         # Run redirects hook with the active Chrome session
         result = subprocess.run(
             ['node', str(REDIRECTS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
@@ -80,7 +80,8 @@ class TestResponsesWithChrome(TestCase):
         # Run responses hook with the active Chrome session
         result = subprocess.run(
             ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
@@ -65,7 +65,8 @@ def test_extracts_screenshot_from_example_com():
         cwd=tmpdir,
         capture_output=True,
         text=True,
-        timeout=120
+        timeout=120,
+        env=get_test_env()
     )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -80,7 +80,8 @@ class TestSEOWithChrome(TestCase):
         # Run SEO hook with the active Chrome session
         result = subprocess.run(
             ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
@@ -80,7 +80,8 @@ class TestSSLWithChrome(TestCase):
         # Run SSL hook with the active Chrome session
         result = subprocess.run(
             ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=60,
@@ -80,7 +80,8 @@ class TestStaticfileWithChrome(TestCase):
         # Run staticfile hook with the active Chrome session
         result = subprocess.run(
             ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
-            cwd=str(snapshot_chrome_dir),
+            cwd=str(snapshot_chrome_dir,
+            env=get_test_env()),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
@@ -53,7 +53,8 @@ def test_extracts_title_from_example_com():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -105,7 +106,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -219,7 +221,8 @@ def test_handles_https_urls():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     if result.returncode == 0:
         # Hook writes to current directory
@@ -249,7 +252,8 @@ def test_handles_404_gracefully():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # May succeed or fail depending on server behavior
     # example.com returns "Example Domain" even for 404s
@@ -272,7 +276,8 @@ def test_handles_redirects():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # Should succeed and follow redirect
     if result.returncode == 0:
@@ -283,7 +283,8 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
-        cwd=str(script_dir),
+        cwd=str(script_dir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env,
@@ -482,7 +483,8 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
-        cwd=str(tmpdir),
+        cwd=str(tmpdir,
+        env=get_test_env()),
         capture_output=True,
         text=True,
         env=env,
@@ -30,6 +30,7 @@ __package__ = 'archivebox.workers'
 import os
 import time
 from typing import Type
+from datetime import timedelta
 from multiprocessing import Process as MPProcess
 
 from django.utils import timezone
@@ -67,12 +68,19 @@ class Orchestrator:
     MAX_WORKERS_PER_TYPE: int = 8   # Max workers per model type
     MAX_TOTAL_WORKERS: int = 24     # Max workers across all types
 
-    def __init__(self, exit_on_idle: bool = True):
+    def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
         self.exit_on_idle = exit_on_idle
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file = None
         self.idle_count: int = 0
+        self._last_cleanup_time: float = 0.0  # For throttling cleanup_stale_running()
+
+        # CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
+        # to keep execution strictly sequential and deterministic
+        if self.exit_on_idle:
+            self.MAX_WORKERS_PER_TYPE = 1
+            self.MAX_TOTAL_WORKERS = 1
 
     def __repr__(self) -> str:
         return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
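The new constructor contract: an orchestrator can be scoped to a single crawl, and foreground mode (exit_on_idle=True) is forced down to one worker so execution stays sequential. A minimal sketch (the import path and crawl id are assumptions, not shown in this diff):

    from archivebox.workers.orchestrator import Orchestrator  # assumed module path

    orch = Orchestrator(exit_on_idle=True, crawl_id='0196fa3c-example-crawl-id')  # hypothetical id
    assert orch.crawl_id == '0196fa3c-example-crawl-id'
    assert orch.MAX_WORKERS_PER_TYPE == 1 and orch.MAX_TOTAL_WORKERS == 1  # foreground => strictly sequential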
@@ -315,15 +323,12 @@ class Orchestrator:
         # Enable progress bars only in TTY + foreground mode
         show_progress = IS_TTY and self.exit_on_idle
 
-        # Debug
-        print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
-
         self.on_startup()
         task_ids = {}
 
         if not show_progress:
             # No progress bars - just run normally
-            self._run_orchestrator_loop(None, task_ids, None, None)
+            self._run_orchestrator_loop(None, task_ids)
         else:
             # Redirect worker subprocess output to /dev/null
             devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -356,7 +361,7 @@ class Orchestrator:
                 TaskProgressColumn(),
                 console=orchestrator_console,
             ) as progress:
-                self._run_orchestrator_loop(progress, task_ids, None, None)
+                self._run_orchestrator_loop(progress, task_ids)
 
             # Restore original console
             logging_module.CONSOLE = original_console
@@ -374,7 +379,7 @@ class Orchestrator:
                 pass
             # stdout_for_console is closed by orchestrator_console
 
-    def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
+    def _run_orchestrator_loop(self, progress, task_ids):
         """Run the main orchestrator loop with optional progress display."""
         try:
             while True:
@@ -385,12 +390,28 @@ class Orchestrator:
                 if progress:
                     from archivebox.core.models import Snapshot
 
-                    # Get all started snapshots
-                    active_snapshots = list(Snapshot.objects.filter(status='started'))
+                    # Get all started snapshots (optionally filtered by crawl_id)
+                    snapshot_filter = {'status': 'started'}
+                    if self.crawl_id:
+                        snapshot_filter['crawl_id'] = self.crawl_id
+                    else:
+                        # Only if processing all crawls, filter by recent modified_at to avoid stale snapshots
+                        recent_cutoff = timezone.now() - timedelta(minutes=5)
+                        snapshot_filter['modified_at__gte'] = recent_cutoff
+
+                    active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
 
                     # Track which snapshots are still active
                     active_ids = set()
 
+                    # Debug: check for duplicates
+                    snapshot_urls = [s.url for s in active_snapshots]
+                    if len(active_snapshots) != len(set(snapshot_urls)):
+                        # We have duplicate URLs - let's deduplicate by showing snapshot ID
+                        show_id = True
+                    else:
+                        show_id = False
+
                     for snapshot in active_snapshots:
                         active_ids.add(snapshot.id)
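The filter built above is equivalent to the following standalone helper (sketch; the field names are the ones used in the hunk):

    from datetime import timedelta
    from django.utils import timezone
    from archivebox.core.models import Snapshot

    def active_snapshots(crawl_id: str | None):
        qs = Snapshot.objects.filter(status='started')
        if crawl_id:
            # Crawl-scoped orchestrator: only this crawl's snapshots drive the progress bars
            return qs.filter(crawl_id=crawl_id)
        # Global orchestrator: ignore snapshots not touched in the last 5 minutes (likely stale)
        return qs.filter(modified_at__gte=timezone.now() - timedelta(minutes=5))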
@@ -421,7 +442,11 @@ class Orchestrator:
 
                         # Build description with URL + current plugin
                         url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
-                        description = f"{url}{current_plugin}"
+                        if show_id:
+                            # Show snapshot ID if there are duplicate URLs
+                            description = f"[{str(snapshot.id)[:8]}] {url}{current_plugin}"
+                        else:
+                            description = f"{url}{current_plugin}"
 
                         # Create or update task
                         if snapshot.id not in task_ids:
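The label logic above, as a runnable one-off (sketch; the id and plugin suffix are made up):

    def progress_label(snapshot_id: str, url: str, current_plugin: str, show_id: bool) -> str:
        url = url[:50] + '...' if len(url) > 50 else url
        prefix = f"[{snapshot_id[:8]}] " if show_id else ""
        return f"{prefix}{url}{current_plugin}"

    assert progress_label('0196fa3c-0000', 'https://example.com', ' → wget', show_id=True).startswith('[0196fa3c]')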
@@ -63,9 +63,10 @@ class Worker:
     POLL_INTERVAL: ClassVar[float] = 0.2  # How often to check for new work (seconds)
     IDLE_TIMEOUT: ClassVar[int] = 50      # Exit after N idle iterations (10 sec at 0.2 poll interval)
 
-    def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
+    def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
         self.worker_id = worker_id
         self.daemon = daemon
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file: Path | None = None
         self.idle_count: int = 0
@@ -346,6 +347,13 @@ class CrawlWorker(Worker):
         from archivebox.crawls.models import Crawl
         return Crawl
 
+    def get_queue(self) -> QuerySet:
+        """Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
+        qs = super().get_queue()
+        if self.crawl_id:
+            qs = qs.filter(id=self.crawl_id)
+        return qs
+
 
 class SnapshotWorker(Worker):
     """Worker for processing Snapshot objects."""
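With crawl_id plumbed through Worker.__init__, a crawl-scoped CrawlWorker only ever sees its own crawl in the queue. A minimal sketch of that behaviour (import path assumed; crawl stands for an existing Crawl row):

    from archivebox.workers.worker import CrawlWorker  # assumed module path

    worker = CrawlWorker(worker_id=0, crawl_id=str(crawl.id))
    queue = worker.get_queue()  # super().get_queue() narrowed by .filter(id=crawl.id)
    assert set(queue.values_list('id', flat=True)) <= {crawl.id}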