This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -3,5 +3,5 @@ __order__ = 100
def register_admin(admin_site):
from workers.admin import register_admin
from archivebox.workers.admin import register_admin
register_admin(admin_site)

View File

@@ -3,5 +3,5 @@ from django.apps import AppConfig
class WorkersConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'workers'
name = 'archivebox.workers'

View File

@@ -1,6 +1,6 @@
from django.core.management.base import BaseCommand
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
class Command(BaseCommand):

View File

@@ -42,6 +42,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
retry_at_field_name: ClassVar[str]
class Meta:
app_label = 'workers'
abstract = True
@classmethod
@@ -163,9 +164,9 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
def bump_retry_at(self, seconds: int = 10):
self.RETRY_AT = timezone.now() + timedelta(seconds=seconds)
def update_for_workers(self, **kwargs) -> bool:
def update_and_requeue(self, **kwargs) -> bool:
"""
Atomically update the object's fields for worker processing.
Atomically update fields and schedule retry_at for next worker tick.
Returns True if the update was successful, False if the object was modified by another worker.
"""
# Get the current retry_at to use as optimistic lock
@@ -307,7 +308,7 @@ class ModelWithStateMachine(BaseModelWithStateMachine):
status: models.CharField = BaseModelWithStateMachine.StatusField()
retry_at: models.DateTimeField = BaseModelWithStateMachine.RetryAtField()
state_machine_name: ClassVar[str] # e.g. 'core.statemachines.ArchiveResultMachine'
state_machine_name: ClassVar[str] # e.g. 'core.models.ArchiveResultMachine'
state_field_name: ClassVar[str] = 'status'
state_machine_attr: ClassVar[str] = 'sm'
bind_events_as_methods: ClassVar[bool] = True
@@ -316,4 +317,41 @@ class ModelWithStateMachine(BaseModelWithStateMachine):
retry_at_field_name: ClassVar[str] = 'retry_at'
class Meta:
app_label = 'workers'
abstract = True
class BaseStateMachine(StateMachine):
    """
    Shared base class for the ArchiveBox state machines.

    Provides the __init__, __repr__ and __str__ implementations that were
    previously copy-pasted into each of the 4 state machines
    (Snapshot, ArchiveResult, Crawl, Binary).

    Subclasses set ``model_attr_name`` to choose the attribute under which
    the model instance is stored (e.g. 'snapshot', 'archiveresult',
    'crawl', 'binary'); it is then reachable as ``self.<model_attr_name>``
    (e.g. ``self.snapshot``, ``self.archiveresult``).

    Example usage:

        class SnapshotMachine(BaseStateMachine, strict_states=True):
            model_attr_name = 'snapshot'

            # States and transitions...
            queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
            # ...
    """

    # Name of the instance attribute that holds the model; override in subclasses.
    model_attr_name: str = 'obj'

    def __init__(self, obj, *args, **kwargs):
        """Store ``obj`` under ``model_attr_name``, then run the base initializer."""
        # The attribute is assigned *before* delegating to the parent class,
        # so it already exists while the parent initializer is running.
        attr_name = self.model_attr_name
        setattr(self, attr_name, obj)
        # ``obj`` is also forwarded as the first positional argument.
        super().__init__(obj, *args, **kwargs)

    def __repr__(self) -> str:
        """Return ``'<ClassName>[<model id>]'`` for the wrapped model instance."""
        model = getattr(self, self.model_attr_name)
        machine_name = self.__class__.__name__
        return f'{machine_name}[{model.id}]'

    def __str__(self) -> str:
        """Return the same text as ``__repr__``."""
        return self.__repr__()

View File

@@ -41,7 +41,7 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
Returns the number of snapshots queued.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
kwargs = kwargs or {}
@@ -68,7 +68,7 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
Returns 1 if queued, 0 otherwise.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
# Queue the snapshot by setting status to queued
if hasattr(snapshot, 'id'):

View File

@@ -2,7 +2,7 @@
from django.views.generic import TemplateView
from django.contrib.auth.mixins import UserPassesTestMixin
from django.utils import timezone
from api.auth import get_or_create_api_token
from archivebox.api.auth import get_or_create_api_token
class JobsDashboardView(UserPassesTestMixin, TemplateView):

View File

@@ -322,7 +322,7 @@ class CrawlWorker(Worker):
MAX_TICK_TIME: ClassVar[int] = 60
def get_model(self):
from crawls.models import Crawl
from archivebox.crawls.models import Crawl
return Crawl
@@ -333,7 +333,7 @@ class SnapshotWorker(Worker):
MAX_TICK_TIME: ClassVar[int] = 60
def get_model(self):
from core.models import Snapshot
from archivebox.core.models import Snapshot
return Snapshot
@@ -348,7 +348,7 @@ class ArchiveResultWorker(Worker):
self.plugin = plugin
def get_model(self):
from core.models import ArchiveResult
from archivebox.core.models import ArchiveResult
return ArchiveResult
def get_queue(self) -> QuerySet:
@@ -358,7 +358,7 @@ class ArchiveResultWorker(Worker):
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
"""
from core.models import ArchiveResult
from archivebox.core.models import ArchiveResult
from archivebox.hooks import extract_step
qs = super().get_queue()