__package__ = 'archivebox.crawls'

from datetime import timedelta

from django.utils import timezone
from rich import print
from statemachine import State, StateMachine

# from workers.actor import ActorType
from crawls.models import Crawl


class CrawlMachine(StateMachine, strict_states=True):
    """State machine for managing Crawl lifecycle."""

    model: Crawl

    # States
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, crawl, *args, **kwargs):
        self.crawl = crawl
        super().__init__(crawl, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Crawl[{self.crawl.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        if not self.crawl.seed:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
            return False
        if not self.crawl.seed.uri:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
            return False
        return True

    def is_finished(self) -> bool:
        from core.models import Snapshot, ArchiveResult

        # check that at least one snapshot exists for this crawl
        snapshots = Snapshot.objects.filter(crawl=self.crawl)
        if not snapshots.exists():
            return False

        # check to make sure no snapshots are in non-final states
        if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
            return False

        # check that some archiveresults exist for this crawl
        results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
        if not results.exists():
            return False

        # check if all archiveresults are finished
        if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
            return False

        return True

    # def before_transition(self, event, state):
    #     print(f"Before '{event}', on the '{state.id}' state.")
    #     return "before_transition_return"

    @started.enter
    def enter_started(self):
        # Suppressed: state transition logs
        # lock the crawl object while we create snapshots
        self.crawl.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=5),
            status=Crawl.StatusChoices.QUEUED,
        )
        try:
            # Run on_Crawl hooks to validate/install dependencies
            self._run_crawl_hooks()

            # Run the crawl - creates root snapshot and processes queued URLs
            self.crawl.run()

            # only update status to STARTED once snapshots are created
            self.crawl.update_for_workers(
                retry_at=timezone.now() + timedelta(seconds=5),
                status=Crawl.StatusChoices.STARTED,
            )
        except Exception as e:
            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
            import traceback
            traceback.print_exc()
            # Re-raise so the worker knows it failed
            raise

    def _run_crawl_hooks(self):
        """Run on_Crawl hooks to validate/install dependencies."""
        from pathlib import Path
        from archivebox.hooks import run_hooks, discover_hooks
        from archivebox.config import CONSTANTS

        # Discover and run all on_Crawl hooks
        hooks = discover_hooks('Crawl')
        if not hooks:
            return

        # Create a temporary output directory for hook results
        output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
        output_dir.mkdir(parents=True, exist_ok=True)

        # Run all on_Crawl hooks
        results = run_hooks(
            event_name='Crawl',
            output_dir=output_dir,
            timeout=60,
            config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
            crawl_id=str(self.crawl.id),
            seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
        )

        # Process hook results - parse JSONL output and create DB objects
        self._process_hook_results(results)

    def _process_hook_results(self, results: list):
        """Process JSONL output from hooks to create InstalledBinary and update Machine config."""
        import json
        from machine.models import Machine, InstalledBinary

        machine = Machine.current()

        for result in results:
            if result['returncode'] != 0:
                # Hook failed - might indicate a missing dependency
                continue

            # Parse JSONL output
            for line in result['stdout'].strip().split('\n'):
                if not line.strip():
                    continue
                try:
                    obj = json.loads(line)
                    obj_type = obj.get('type')

                    if obj_type == 'InstalledBinary':
                        # Create or update InstalledBinary record
                        # Skip if essential fields are missing
                        if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
                            continue
                        InstalledBinary.objects.update_or_create(
                            machine=machine,
                            name=obj['name'],
                            defaults={
                                'abspath': obj['abspath'],
                                'version': obj['version'],
                                'sha256': obj.get('sha256') or '',
                                'binprovider': obj.get('binprovider') or 'env',
                            },
                        )
                    elif obj_type == 'Machine':
                        # Update Machine config
                        method = obj.get('_method', 'update')
                        if method == 'update':
                            key = obj.get('key', '')
                            value = obj.get('value')
                            if key.startswith('config/'):
                                config_key = key[7:]  # Remove 'config/' prefix
                                machine.config[config_key] = value
                                machine.save(update_fields=['config'])
                    elif obj_type == 'Dependency':
                        # Dependency request - could trigger installation
                        # For now just log it (installation hooks would be separate)
                        print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')
                except json.JSONDecodeError:
                    # Not JSON, skip
                    continue

    @sealed.enter
    def enter_sealed(self):
        # Suppressed: state transition logs
        self.crawl.update_for_workers(
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )
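

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module; the worker entrypoint and
# queryset below are assumptions for illustration, not ArchiveBox's actual
# worker loop). A worker that polls for pending crawls could drive this
# machine by firing `tick`, which advances queued -> started -> sealed as
# the `can_start`/`is_finished` guards permit and stays put otherwise:
#
#     from django.utils import timezone
#     from crawls.models import Crawl
#     from crawls.statemachines import CrawlMachine  # hypothetical module path
#
#     def process_pending_crawls():
#         pending = Crawl.objects.exclude(status=Crawl.StatusChoices.SEALED)
#         for crawl in pending.filter(retry_at__lte=timezone.now()):
#             CrawlMachine(crawl).tick()  # no-op self-transition if guards fail
# ---------------------------------------------------------------------------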