type and test fixes

This commit is contained in:
Nick Sweeting
2026-03-15 20:12:27 -07:00
parent 3889eb4efa
commit bc21d4bfdb
52 changed files with 762 additions and 1317 deletions

View File

@@ -31,7 +31,7 @@ __package__ = 'archivebox.workers'
import os
import time
from typing import Type
from datetime import timedelta
from datetime import datetime, timedelta
from multiprocessing import Process as MPProcess
from pathlib import Path
@@ -189,7 +189,7 @@ class Orchestrator:
event='Shutting down',
indent_level=0,
pid=self.pid,
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
error=error if isinstance(error, Exception) and not isinstance(error, KeyboardInterrupt) else None,
)
def get_total_worker_count(self) -> int:
@@ -567,7 +567,8 @@ class Orchestrator:
status=ArchiveResult.StatusChoices.STARTED,
).select_related('process')
for ar in started_ars:
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
process_id = getattr(ar, 'process_id', None)
if process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
try:
ar.process.kill_tree(graceful_timeout=0.0)
except Exception:
@@ -904,28 +905,29 @@ class Orchestrator:
size = ''
stderr_tail = ''
if ar:
if ar.process_id and ar.process:
process_id = getattr(ar, 'process_id', None)
if process_id and ar.process:
stderr_tail = _tail_stderr_line(ar.process)
if ar.status == ArchiveResult.StatusChoices.STARTED:
status = 'started'
is_running = True
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None)
if start_ts:
elapsed = _format_seconds((now - start_ts).total_seconds())
hook_timeout = None
if ar.process_id and ar.process and ar.process.timeout:
if process_id and ar.process and ar.process.timeout:
hook_timeout = ar.process.timeout
hook_timeout = hook_timeout or hook_timeouts.get(hook_name)
if hook_timeout:
timeout = _format_seconds(hook_timeout)
else:
status = ar.status
if ar.process_id and ar.process and ar.process.exit_code == 137:
if process_id and ar.process and ar.process.exit_code == 137:
status = 'failed'
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None)
end_ts = ar.end_ts or (ar.process.ended_at if process_id and ar.process else None)
if start_ts and end_ts:
elapsed = _format_seconds((end_ts - start_ts).total_seconds())
size = _format_size(getattr(ar, 'output_size', None))
@@ -1093,7 +1095,7 @@ class Orchestrator:
from archivebox.core.models import Snapshot
# Get all started snapshots (optionally filtered by crawl_id)
snapshot_filter = {'status': 'started'}
snapshot_filter: dict[str, str | datetime] = {'status': 'started'}
if self.crawl_id:
snapshot_filter['crawl_id'] = self.crawl_id
else:

View File

@@ -335,6 +335,7 @@ def start_worker(supervisor, daemon, lazy=False):
for added in added:
supervisor.addProcessGroup(added)
procs = []
for _ in range(25):
procs = supervisor.getAllProcessInfo()
for proc in procs:

View File

@@ -1,7 +1,9 @@
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.test import TestCase
from django.utils import timezone
@@ -12,7 +14,8 @@ from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
def setUp(self):
self.user = get_user_model().objects.create_user(
user_manager = cast(UserManager, get_user_model().objects)
self.user = user_manager.create_user(
username='schedule-user',
password='password',
)
@@ -52,6 +55,8 @@ class TestScheduledCrawlMaterialization(TestCase):
self.assertEqual(scheduled_crawls.count(), 2)
queued_crawl = scheduled_crawls.last()
self.assertIsNotNone(queued_crawl)
assert queued_crawl is not None
self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
self.assertEqual(queued_crawl.max_depth, 1)
@@ -63,7 +68,7 @@ class TestScheduledCrawlMaterialization(TestCase):
Orchestrator(exit_on_idle=True)._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
@patch.object(CrawlWorker, 'start')

View File

@@ -1,5 +1,6 @@
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
from unittest.mock import patch
from django.test import SimpleTestCase
@@ -11,14 +12,14 @@ class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
def _make_worker(self):
worker = SnapshotWorker.__new__(SnapshotWorker)
worker.pid = 12345
worker.snapshot = SimpleNamespace(
cast(Any, worker).snapshot = SimpleNamespace(
status='started',
refresh_from_db=lambda: None,
)
worker._snapshot_exceeded_hard_timeout = lambda: False
worker._seal_snapshot_due_to_timeout = lambda: None
worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
worker._wait_for_hook = lambda *args, **kwargs: None
worker._wait_for_hook = lambda process, ar: None
return worker
@patch('archivebox.workers.worker.log_worker_event')
@@ -49,10 +50,10 @@ class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
run_calls.append((args, kwargs))
return SimpleNamespace()
def wait_for_hook(process, archive_result):
wait_calls.append((process, archive_result))
archive_result.status = 'succeeded'
archive_result.output_files = {'singlefile.html': {}}
def wait_for_hook(process, ar):
wait_calls.append((process, ar))
ar.status = 'succeeded'
ar.output_files = {'singlefile.html': {}}
archive_result = SimpleNamespace(
status='failed',