unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -4,6 +4,7 @@
from django.db import migrations, connection
import json
from pathlib import Path
from archivebox.uuid_compat import uuid7
def parse_cmd_field(cmd_raw):
@@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw):
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
import uuid
import socket
from datetime import datetime
@@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor):
return row[0]
# Create new machine
machine_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
machine_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
Returns:
binary_id (str)
"""
import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
@@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
return row[0]
# Create new binary
binary_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
binary_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
Returns:
process_id (str)
"""
import uuid
from datetime import datetime
process_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
process_id = uuid7().hex
now = datetime.now().isoformat()
# Convert cmd array to JSON

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 08:43
from django.db import migrations, models
class Migration(migrations.Migration):
    """Bump the default Snapshot.fs_version to '0.9.0'.

    Pure schema-state change: only the CharField's default/help_text are
    altered, so no data migration is needed. New snapshots will be stamped
    '0.9.0'; existing rows keep their stored fs_version and are lazily
    migrated on save() (per the help_text).
    """

    dependencies = [
        # Must run after ArchiveResult rows were copied into machine.Process.
        ('core', '0027_copy_archiveresult_to_process'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
    ]

View File

@@ -0,0 +1,181 @@
# Generated by hand on 2026-01-02
# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot)
from django.db import migrations, models, connection
from uuid import UUID
from archivebox.uuid_compat import uuid7
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """
    Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot).

    Strategy:
    1. Rebuild core_archiveresult as core_archiveresult_new with a TEXT UUID
       primary key, keeping the legacy integer id in a new ``old_id`` column.
    2. Reuse any existing ``uuid`` values, normalized to 32-char hex (the
       format Django's SQLiteUUIDField stores); generate time-ordered UUIDv7s
       for rows that have none.
    3. Copy all rows, drop the old table, rename the new one into place, and
       recreate the indexes.

    Args:
        apps: historical app registry (unused — raw SQL is used because the
              schema differs between 0.8.x and 0.9.x databases).
        schema_editor: migration schema editor (unused, see NOTE below).

    NOTE(review): uses the global ``connection`` rather than
    ``schema_editor.connection``, so this always targets the default DB alias.
    NOTE(review): when the table exists but is empty we return without
    rebuilding it, leaving the integer-PK schema in place — presumably fresh
    installs create the table directly from the new model state; verify for
    upgraded-but-empty collections.
    """
    cursor = connection.cursor()

    # Bail out cleanly if the table was never created (fresh install path).
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
    if not cursor.fetchone():
        print('ArchiveResult table does not exist, skipping migration')
        return

    cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
    row_count = cursor.fetchone()[0]
    if row_count == 0:
        print('No ArchiveResult records to migrate')
        return

    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')

    # Step 0: if machine_process doesn't exist yet, NULL out process_id values
    # so the new table's FK to machine_process(id) can't dangle.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
    machine_process_exists = cursor.fetchone() is not None
    if not machine_process_exists:
        print('machine_process table does not exist yet, setting process_id to NULL')
        cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")

    # Step 1: create the replacement table with the UUID as primary key.
    cursor.execute("""
        CREATE TABLE core_archiveresult_new (
            id TEXT PRIMARY KEY NOT NULL,
            old_id INTEGER,
            uuid TEXT UNIQUE,
            created_at DATETIME NOT NULL,
            modified_at DATETIME NOT NULL,
            snapshot_id TEXT NOT NULL,
            plugin VARCHAR(32) NOT NULL,
            hook_name VARCHAR(255) NOT NULL DEFAULT '',
            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            retry_at DATETIME,
            start_ts DATETIME,
            end_ts DATETIME,
            output_str TEXT NOT NULL DEFAULT '',
            output_json TEXT,
            output_files TEXT NOT NULL DEFAULT '{}',
            output_size BIGINT NOT NULL DEFAULT 0,
            output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
            config TEXT NOT NULL DEFAULT '{}',
            notes TEXT NOT NULL DEFAULT '',
            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
            num_uses_failed INTEGER NOT NULL DEFAULT 0,
            process_id TEXT,
            FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
            FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL
        );
    """)

    # Step 2: pick (or mint) a UUID for every legacy integer id.
    cursor.execute("SELECT id, uuid FROM core_archiveresult")
    id_to_uuid = {}
    for legacy_id, existing_uuid in cursor.fetchall():
        if existing_uuid:
            # Normalize existing UUID to 32-char hex (Django SQLite UUIDField
            # format) — old schemas may have stored it with or without dashes.
            id_to_uuid[legacy_id] = UUID(existing_uuid).hex
        else:
            # Generate a new UUIDv7 (time-ordered) as 32-char hex.
            id_to_uuid[legacy_id] = uuid7().hex

    # Step 3: copy every row across, keyed by its new UUID.
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    col_names = [col[1] for col in cursor.fetchall()]
    cursor.execute("SELECT * FROM core_archiveresult")
    old_records = cursor.fetchall()

    fields_to_copy = [
        'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
        'status', 'retry_at', 'start_ts', 'end_ts',
        'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes',
        'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
    ]
    # Hoisted out of the per-row loop: the set of columns actually present in
    # the old table and the INSERT statement are identical for every row.
    existing_fields = [f for f in fields_to_copy if f in col_names]
    field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
    placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
    insert_sql = f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})"

    for record in old_records:
        values = dict(zip(col_names, record))
        legacy_id = values['id']
        new_uuid = id_to_uuid[legacy_id]
        # New PK and the compat `uuid` column both get the same 32-char hex.
        cursor.execute(insert_sql, [new_uuid, legacy_id, new_uuid] + [values.get(f) for f in existing_fields])

    # Step 4: swap the new table into place.
    cursor.execute("DROP TABLE core_archiveresult")
    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")

    # Step 5: recreate indexes (DROP TABLE removed the old ones).
    cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
    cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
    cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
    cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
    cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
    cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
    cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")

    print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
class Migration(migrations.Migration):
    """Swap ArchiveResult's integer PK for a UUID PK.

    Uses SeparateDatabaseAndState because the database change is a raw-SQL
    table rebuild (SQLite can't alter a PK in place) while Django's model
    state must independently record the equivalent field changes.
    """

    dependencies = [
        ('core', '0028_alter_snapshot_fs_version'),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            # Raw-SQL rebuild of core_archiveresult; irreversible (noop reverse)
            # since the legacy integer PKs can't be reconstructed after rollback
            # of subsequent migrations.
            database_operations=[
                migrations.RunPython(
                    migrate_archiveresult_id_to_uuid,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            # State-only mirror of what the SQL above did to the table.
            state_operations=[
                # Remove old uuid field
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='uuid',
                ),
                # Change id from AutoField to UUIDField
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                ),
                # Add old_id field to preserve legacy integer IDs
                migrations.AddField(
                    model_name='archiveresult',
                    name='old_id',
                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,19 @@
# Generated by Django 6.0 on 2026-01-02 10:02
import uuid

from django.db import migrations, models

from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
    """Re-align ArchiveResult.id field options with the current model state.

    State-only tidy-up after 0029 made ``id`` a UUID primary key: adds
    ``serialize=False`` to match what Django autodetects for PK fields.

    Fix: the autogenerated version used ``default=uuid.uuid7``, but
    ``uuid.uuid7`` only exists on Python >= 3.14, so loading this migration
    crashed on older interpreters. Use ``archivebox.uuid_compat.uuid7``
    instead, which is what migration 0029 and the rest of the codebase
    already use for this field's default.
    """

    dependencies = [
        ('core', '0029_migrate_archiveresult_to_uuid_pk'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
    ]

View File

@@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version}{self._fs_current_version()}")
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
@@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Only run if method exists (most are no-ops)
if hasattr(self, method):
print(f"[DEBUG save()] Running {method}()")
getattr(self, method)()
current = next_ver
@@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
old_dir = self.get_storage_path_for_version('0.8.0')
new_dir = self.get_storage_path_for_version('0.9.0')
if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
# Even if no directory migration needed, still convert index format
self.convert_index_json_to_jsonl()
return
print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}")
if not old_dir.exists() or old_dir == new_dir:
# No migration needed
print(f"[DEBUG _fs_migrate] Returning None (early return)")
return None
if new_dir.exists():
# New directory already exists (files already copied), but we still need cleanup
# Return cleanup info so old directory can be cleaned up
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory and create symlink after successful migration.
Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
# Log but don't raise - migration succeeded, this is just cleanup
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
else:
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
print(f"[DEBUG] Successfully created symlink")
except Exception as e:
print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
else:
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not timestamp:
return None
# Look up existing
# Look up existing (try exact match first, then fuzzy match for truncated timestamps)
try:
return cls.objects.get(url=url, timestamp=timestamp)
snapshot = cls.objects.get(url=url, timestamp=timestamp)
print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
return snapshot
except cls.DoesNotExist:
print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
# Try fuzzy match - index.json may have truncated timestamp
# e.g., index has "1767000340" but DB has "1767000340.624737"
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
if candidates.count() == 1:
snapshot = candidates.first()
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
return candidates.first()
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
except cls.MultipleObjectsReturned:
# Should not happen with unique constraint
print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
return cls.objects.filter(url=url, timestamp=timestamp).first()
@classmethod
@@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return archiveresults
def advance_step_if_ready(self) -> bool:
"""
Advance current_step if all foreground hooks in current step are finished.
Called by the state machine to check if step can advance.
Background hooks (.bg) don't block step advancement.
Step advancement rules:
- All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
- Background ARs (hook_name contains '.bg.') are ignored for advancement
- When ready, increments current_step by 1 (up to 9)
Returns:
True if step was advanced, False if not ready or already at step 9.
"""
from archivebox.hooks import extract_step, is_background_hook
if self.current_step >= 9:
return False # Already at final step
# Get all ARs for current step that are foreground
current_step_ars = self.archiveresult_set.filter(
hook_name__isnull=False
).exclude(hook_name='')
# Check each AR in current step
for ar in current_step_ars:
ar_step = extract_step(ar.hook_name)
if ar_step != self.current_step:
continue # Not in current step
if is_background_hook(ar.hook_name):
continue # Background hooks don't block
# Foreground hook in current step - check if finished
if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
# Still pending/queued - can't advance
return False
if ar.status == ArchiveResult.StatusChoices.STARTED:
# Still running - can't advance
return False
# All foreground hooks in current step are finished - advance!
self.current_step += 1
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
"""
Check if this snapshot has finished processing.
Check if all ArchiveResults are finished.
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
Returns:
True if all archiveresults are finished (or no work to do), False otherwise.
Note: This is only called for observability/progress tracking.
SnapshotWorker owns the execution and doesn't poll this.
"""
# if no archiveresults exist yet, it's not finished
if not self.archiveresult_set.exists():
return False
# Check if any ARs are still pending/started
pending = self.archiveresult_set.exclude(
status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
).exists()
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.pending_archiveresults().exists():
return False
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
return not pending
def get_progress_stats(self) -> dict:
"""
@@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
@@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
can_start = bool(self.snapshot.url)
return can_start
def is_finished(self) -> bool:
"""Check if all ArchiveResults for this snapshot are finished."""
return self.snapshot.is_finished_processing()
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
import sys
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# Check if any archiveresults were created
ar_count = self.snapshot.archiveresult_set.count()
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
if ar_count == 0:
# No archiveresults created, seal immediately
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
self.seal()
else:
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
# Last AR will manually call self.seal() when done
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Snapshot.StatusChoices.STARTED,
)
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
self.snapshot.status = Snapshot.StatusChoices.STARTED
self.snapshot.retry_at = None # No more polling
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@sealed.enter
def enter_sealed(self):
@@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugins = [get_plugin_name(e) for e in get_plugins()]
return tuple((e, e) for e in plugins)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
id = models.AutoField(primary_key=True, editable=False)
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
# UUID primary key (migrated from integer in 0029)
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
# old_id preserves the legacy integer ID for backward compatibility
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
# Note: uuid field was removed in migration 0029 when id became UUID
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
start_ts = timezone.now()
is_bg_hook = False
process = None
for hook in hooks:
# Check if this is a background hook
is_bg_hook = is_background_hook(hook.name)
result = run_hook(
# Run hook using Process.launch() - returns Process model
process = run_hook(
hook,
output_dir=plugin_dir,
config=config,
@@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
depth=self.snapshot.depth,
)
# Background hooks return None
if result is None:
is_bg_hook = True
# Update status based on hook execution
if is_bg_hook:
# BACKGROUND HOOK - still running, return immediately
# Status stays STARTED, will be finalized by Snapshot.cleanup()
self.status = self.StatusChoices.STARTED
# Link ArchiveResult to Process
self.process = process
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
if not process:
# No hooks ran
self.status = self.StatusChoices.FAILED
self.output_str = 'No hooks executed'
self.save()
return
# Update status based on hook execution
if process.status == process.StatusChoices.RUNNING:
# BACKGROUND HOOK - still running, return immediately
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
return
# FOREGROUND HOOK - completed, update from filesystem
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.update_from_output()
# Clean up empty output directory if no files were created
@@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
# Flow: queued → started → (succeeded|failed|skipped)
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
def can_start(self) -> bool:
if not self.archiveresult.snapshot.url:
return False
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
# Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
def is_exceeded_max_attempts(self) -> bool:
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
from archivebox.config.configset import get_config
config = get_config(
@@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
status=ArchiveResult.StatusChoices.FAILED
).count()
if failed_count >= max_attempts:
# Mark this result as skipped since we've hit the limit
self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.retry_at = None
self.archiveresult.save()
return False
return True
return failed_count >= max_attempts
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
@@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
"""
Check if extraction has completed (success, failure, or skipped).
For background hooks in STARTED state, checks if their Process has finished and reaps them.
"""
# If already in final state, return True
if self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
):
return True
# If in STARTED state with a Process, check if Process has finished running
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
if self.archiveresult.process_id:
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running():
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
return False
@queued.enter
def enter_queued(self):
@@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def _check_and_seal_parent_snapshot(self):
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
"""
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
This method is kept for backwards compatibility with manual CLI commands.
"""
import sys
snapshot = self.archiveresult.snapshot
@@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_failed(self):
import sys
print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_skipped(self):
import sys
# Set output_str if not already set (e.g., when skipped due to max attempts)
if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
from archivebox.config.configset import get_config
config = get_config(
crawl=self.archiveresult.snapshot.crawl,
snapshot=self.archiveresult.snapshot,
)
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,