This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -17,8 +17,8 @@ from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from core.models import Snapshot
from crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):

View File

@@ -3,4 +3,4 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "crawls"
name = "archivebox.crawls"

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING, Iterable
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -11,13 +12,15 @@ from django.conf import settings
from django.urls import reverse_lazy
from django.utils import timezone
from django_stubs_ext.db.models import TypedModelMeta
from statemachine import State, registry
from rich import print
from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from workers.models import ModelWithStateMachine
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
if TYPE_CHECKING:
from core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot, ArchiveResult
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
@@ -35,6 +38,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
crawl_set: models.Manager['Crawl']
class Meta(TypedModelMeta):
app_label = 'crawls'
verbose_name = 'Scheduled Crawl'
verbose_name_plural = 'Scheduled Crawls'
@@ -73,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
state_machine_name = 'crawls.statemachines.CrawlMachine'
state_machine_name = 'crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -82,6 +86,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
app_label = 'crawls'
verbose_name = 'Crawl'
verbose_name_plural = 'Crawls'
@@ -168,7 +173,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return Path(path_str)
def create_root_snapshot(self) -> 'Snapshot':
from core.models import Snapshot
from archivebox.core.models import Snapshot
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
if not first_url:
@@ -245,7 +250,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
List of newly created Snapshot objects
"""
import json
from core.models import Snapshot
from archivebox.core.models import Snapshot
created_snapshots = []
@@ -309,9 +314,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
import time
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.config.configset import get_config
# Get merged config with crawl context
config = get_config(crawl=self)
# Discover and run on_Crawl hooks
hooks = discover_hooks('Crawl')
hooks = discover_hooks('Crawl', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
@@ -323,8 +332,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
result = run_hook(
hook,
output_dir=output_dir,
timeout=60,
config_objects=[self],
config=config,
crawl_id=str(self.id),
source_url=first_url,
)
@@ -380,7 +388,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
pass
# Run on_CrawlEnd hooks
hooks = discover_hooks('CrawlEnd')
from archivebox.config.configset import get_config
config = get_config(crawl=self)
hooks = discover_hooks('CrawlEnd', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
@@ -391,8 +402,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
result = run_hook(
hook,
output_dir=output_dir,
timeout=30,
config_objects=[self],
config=config,
crawl_id=str(self.id),
source_url=first_url,
)
@@ -400,3 +410,131 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Log failures but don't block
if result and result['returncode'] != 0:
print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]')
# =============================================================================
# State Machines
# =============================================================================
class CrawlMachine(BaseStateMachine, strict_states=True):
    """
    State machine driving a Crawl through QUEUED -> STARTED -> SEALED.

    QUEUED:  waits until the crawl has at least one parseable URL (can_start).
    STARTED: enter_started() calls crawl.run(), which discovers and runs the
             on_Crawl hooks, processes their JSONL output into Snapshots, and
             creates the root snapshot; the snapshots then progress
             independently under their own state machines (SnapshotMachine).
    SEALED:  enter_sealed() calls crawl.cleanup() (runs the on_CrawlEnd hooks
             and stops background hooks) and clears retry_at so workers stop
             picking this crawl up.
    """

    # Attribute name BaseStateMachine uses to expose the bound model instance.
    model_attr_name = 'crawl'

    # States
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event: each worker tick either advances the crawl or leaves it in place.
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        """Guard for queued -> started: require a non-empty, parseable URL list."""
        if not self.crawl.urls:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
            return False
        if not self.crawl.get_urls_list():
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
            return False
        return True

    def is_finished(self) -> bool:
        """Guard for started -> sealed: all snapshots exist and have settled."""
        from archivebox.core.models import Snapshot

        snapshots = Snapshot.objects.filter(crawl=self.crawl)
        # At least one snapshot must exist before the crawl can finish.
        if not snapshots.exists():
            return False
        # Snapshots handle their own background hooks via the step system,
        # so we only wait for every snapshot to leave the pending states.
        pending = [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
        return not snapshots.filter(status__in=pending).exists()

    @started.enter
    def enter_started(self):
        """Run the crawl's hooks/snapshot creation and record the STARTED status."""
        # Bump retry_at first so other workers don't grab this crawl while we
        # are creating snapshots (acts as a 30-second lock).
        self.crawl.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=30),
        )
        try:
            # Runs hooks, processes their JSONL output, creates snapshots.
            self.crawl.run()
            # Snapshots now exist; re-check in 5s rather than busy-looping
            # while they process.
            self.crawl.update_and_requeue(
                retry_at=timezone.now() + timedelta(seconds=5),
                status=Crawl.StatusChoices.STARTED,
            )
        except Exception as e:
            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
            import traceback
            traceback.print_exc()
            # Re-raise so the worker sees the failure.
            raise

    def on_started_to_started(self):
        """Self-transition while snapshots are still processing: re-check soon."""
        self.crawl.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=5),
        )

    @sealed.enter
    def enter_sealed(self):
        """Finalize: run on_CrawlEnd hooks via cleanup() and stop re-queuing."""
        self.crawl.cleanup()
        self.crawl.update_and_requeue(
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )


# =============================================================================
# Register State Machines
# =============================================================================
# Manually register with the python-statemachine registry (normally
# auto-discovered from statemachines.py, but defined here for clarity).
registry.register(CrawlMachine)

View File

@@ -1,114 +0,0 @@
__package__ = 'archivebox.crawls'
import os
from typing import ClassVar
from datetime import timedelta
from django.utils import timezone
from rich import print
from statemachine import State, StateMachine
# from workers.actor import ActorType
from crawls.models import Crawl
class CrawlMachine(StateMachine, strict_states=True):
    """State machine for managing Crawl lifecycle.

    Drives a Crawl through QUEUED -> STARTED -> SEALED via the `tick` event.
    Guard methods (can_start / is_finished) are referenced by name in the
    transition definitions below.
    """

    # The Crawl model instance this machine is bound to.
    model: Crawl

    # States (values mirror Crawl.StatusChoices so the DB status field maps 1:1)
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event: each tick either advances the crawl or leaves it in place,
    # depending on the can_start / is_finished guards.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, crawl, *args, **kwargs):
        # Keep a direct reference so guard/enter callbacks can reach the model.
        self.crawl = crawl
        super().__init__(crawl, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Crawl[{self.crawl.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """Guard for queued -> started: the crawl must have usable URLs."""
        if not self.crawl.urls:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
            return False
        urls_list = self.crawl.get_urls_list()
        if not urls_list:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
            return False
        return True

    def is_finished(self) -> bool:
        """Guard for started -> sealed: snapshots AND archiveresults all settled."""
        from core.models import Snapshot, ArchiveResult
        # check that at least one snapshot exists for this crawl
        snapshots = Snapshot.objects.filter(crawl=self.crawl)
        if not snapshots.exists():
            return False
        # check to make sure no snapshots are in non-final states
        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
            return False
        # check that some archiveresults exist for this crawl
        results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
        if not results.exists():
            return False
        # check if all archiveresults are finished
        if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
            return False
        return True

    # def before_transition(self, event, state):
    #     print(f"Before '{event}', on the '{state.id}' state.")
    #     return "before_transition_return"

    @started.enter
    def enter_started(self):
        # Suppressed: state transition logs
        # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
        self.crawl.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
        )
        try:
            # Run the crawl - runs hooks, processes JSONL, creates snapshots
            self.crawl.run()
            # Update status to STARTED once snapshots are created
            self.crawl.update_for_workers(
                retry_at=timezone.now(),  # Process immediately
                status=Crawl.StatusChoices.STARTED,
            )
        except Exception as e:
            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
            import traceback
            traceback.print_exc()
            # Re-raise so the worker knows it failed
            raise

    @sealed.enter
    def enter_sealed(self):
        # Clean up background hooks and run on_CrawlEnd hooks
        self.crawl.cleanup()
        # Suppressed: state transition logs
        # retry_at=None means workers will never requeue this crawl again (terminal state)
        self.crawl.update_for_workers(
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )