mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
move tests into subfolder, add missing install hooks
This commit is contained in:
@@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
retry_at DATETIME,
|
||||
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
@@ -326,6 +326,16 @@ class Migration(migrations.Migration):
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# Declare fs_version (already created in database with DEFAULT '0.8.0')
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(
|
||||
max_length=10,
|
||||
default='0.8.0',
|
||||
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
|
||||
),
|
||||
),
|
||||
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
|
||||
@@ -150,11 +150,7 @@ class Migration(migrations.Migration):
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
|
||||
# NOTE: modified_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
|
||||
@@ -8,7 +8,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
|
||||
('machine', '0003_add_process_type_and_parent'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
|
||||
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
@@ -0,0 +1,388 @@
|
||||
# Generated by hand on 2026-01-01
|
||||
# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields
|
||||
|
||||
from django.db import migrations, connection
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_cmd_field(cmd_raw):
    """
    Parse a legacy ArchiveResult ``cmd`` field into an argv list.

    The stored value may be:
    1. JSON array string: '["wget", "-p", "url"]'
    2. Space-separated string: 'wget -p url'
    3. NULL/empty

    Args:
        cmd_raw: Raw value from the old cmd column (str or None).

    Returns:
        list of str: parsed argv ([] for NULL/empty input).
    """
    import shlex

    if not cmd_raw:
        return []

    cmd_raw = cmd_raw.strip()

    if not cmd_raw:
        return []

    # Try to parse as JSON first
    if cmd_raw.startswith('['):
        try:
            parsed = json.loads(cmd_raw)
            if isinstance(parsed, list):
                return [str(x) for x in parsed]
        except json.JSONDecodeError:
            pass

    # Fallback: shell-style tokenization so quoted arguments survive
    # (e.g. wget --header="User-Agent: x" stays one token).
    # If the string has unbalanced quotes, degrade to a plain whitespace
    # split, matching the old behavior for malformed values.
    try:
        return shlex.split(cmd_raw)
    except ValueError:
        return cmd_raw.split()
|
||||
|
||||
|
||||
def get_or_create_current_machine(cursor):
    """Get or create Machine.current() using raw SQL.

    Looks up a machine_machine row keyed by a hostname-derived guid;
    inserts a minimal placeholder row if none exists, adapting the INSERT
    to whichever schema version (0.8.x vs 0.9.x) the table currently has.

    Args:
        cursor: open DB cursor.
            NOTE(review): '?' placeholders and PRAGMA assume the SQLite
            backend — confirm this migration never runs on other databases.

    Returns:
        str: id of the existing or newly created machine_machine row.
    """
    import uuid
    import socket
    from datetime import datetime

    # Simple machine detection - get hostname as guid
    hostname = socket.gethostname()
    guid = f'host_{hostname}'  # Simple but stable identifier

    # Check if machine exists
    cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
    row = cursor.fetchone()

    if row:
        return row[0]

    # Create new machine
    machine_id = str(uuid.uuid4())
    # NOTE(review): naive local time, not timezone-aware UTC — confirm this
    # matches how Django stores created_at/modified_at elsewhere.
    now = datetime.now().isoformat()

    # Check which columns exist (schema differs between 0.8.x and 0.9.x)
    cursor.execute("PRAGMA table_info(machine_machine)")
    machine_cols = {row[1] for row in cursor.fetchall()}

    # Build INSERT statement based on available columns
    if 'config' in machine_cols:
        # 0.9.x schema with config column
        # hw_*/os_* detection is skipped: placeholder zeros/empties only
        cursor.execute("""
            INSERT INTO machine_machine (
                id, created_at, modified_at, guid, hostname,
                hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                os_arch, os_family, os_platform, os_release, os_kernel,
                stats, config, num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
                      '', '', '', '', '', '{}', '{}', 0, 0)
        """, [machine_id, now, now, guid, hostname])
    else:
        # 0.8.x schema without config column
        cursor.execute("""
            INSERT INTO machine_machine (
                id, created_at, modified_at, guid, hostname,
                hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                os_arch, os_family, os_platform, os_release, os_kernel,
                stats, num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
                      '', '', '', '', '', '{}', 0, 0)
        """, [machine_id, now, now, guid, hostname])

    return machine_id
|
||||
|
||||
|
||||
def get_or_create_binary(cursor, machine_id, name, abspath, version):
    """
    Get or create a machine_binary row matching (machine, name, abspath, version).

    Args:
        cursor: DB cursor ('?' placeholders and PRAGMA assume SQLite)
        machine_id: Machine FK
        name: Binary name (basename of command)
        abspath: Absolute path to binary, or just the bare command name if
            the original cmd did not contain a path — stored as-is either way
        version: Version string ('' if unknown)

    Returns:
        binary_id (str): id of the existing or newly inserted row.
    """
    import uuid
    from datetime import datetime

    # Check if binary exists with same machine, name, abspath, version
    cursor.execute("""
        SELECT id FROM machine_binary
        WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
    """, [machine_id, name, abspath, version])

    row = cursor.fetchone()
    if row:
        return row[0]

    # Create new binary
    binary_id = str(uuid.uuid4())
    now = datetime.now().isoformat()  # NOTE(review): naive local time — confirm UTC isn't required

    # Check which columns exist (schema differs between 0.8.x and 0.9.x)
    cursor.execute("PRAGMA table_info(machine_binary)")
    binary_cols = {row[1] for row in cursor.fetchall()}

    # Use only columns that exist in current schema
    # 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
    # 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
    if 'binproviders' in binary_cols:
        # 0.9.x schema
        cursor.execute("""
            INSERT INTO machine_binary (
                id, created_at, modified_at, machine_id,
                name, binproviders, overrides, binprovider, abspath, version, sha256,
                status, retry_at, output_dir,
                num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
                      'succeeded', NULL, '', 0, 0)
        """, [binary_id, now, now, machine_id, name, abspath, version])
    else:
        # 0.8.x schema (simpler)
        cursor.execute("""
            INSERT INTO machine_binary (
                id, created_at, modified_at, machine_id,
                name, binprovider, abspath, version, sha256,
                num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
        """, [binary_id, now, now, machine_id, name, abspath, version])

    return binary_id
|
||||
|
||||
|
||||
def map_status(old_status):
    """
    Map old ArchiveResult status to Process status and exit_code.

    Args:
        old_status: One of: queued, started, backoff, succeeded, failed, skipped

    Returns:
        (process_status, exit_code) tuple; any unrecognized status maps
        to ('queued', None).
    """
    if old_status == 'started':
        return ('running', None)
    if old_status == 'succeeded':
        return ('exited', 0)
    if old_status == 'failed':
        return ('exited', 1)
    if old_status == 'skipped':
        # Skipped = exited without error
        return ('exited', None)
    # 'queued', 'backoff', and anything unrecognized all re-queue
    return ('queued', None)
|
||||
|
||||
|
||||
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
    """
    Create a Process record.

    Args:
        cursor: DB cursor ('?' placeholders assume SQLite)
        machine_id: Machine FK for the host that ran the command
        pwd: working directory the command ran in ('' if unknown)
        cmd: argv list (JSON-encoded before insert)
        status: Process status ('queued', 'running', or 'exited')
        exit_code: int exit code, or None if not exited / unknown
        started_at: timestamp string or None
        ended_at: timestamp string or None
        binary_id: Binary FK, or None when cmd was empty

    Returns:
        process_id (str)
    """
    import uuid
    from datetime import datetime

    process_id = str(uuid.uuid4())
    now = datetime.now().isoformat()

    # Convert cmd array to JSON
    cmd_json = json.dumps(cmd)

    # Set retry_at to now for queued processes, NULL otherwise
    # (so workers pick the re-queued process up immediately)
    retry_at = now if status == 'queued' else None

    # NOTE(review): timeout is hard-coded to 120 and process_type to 'cli' —
    # confirm these defaults match the Process model's expectations.
    cursor.execute("""
        INSERT INTO machine_process (
            id, created_at, modified_at, machine_id, parent_id, process_type,
            pwd, cmd, env, timeout,
            pid, exit_code, stdout, stderr,
            started_at, ended_at,
            binary_id, iface_id, url,
            status, retry_at
        ) VALUES (?, ?, ?, ?, NULL, 'cli',
                  ?, ?, '{}', 120,
                  NULL, ?, '', '',
                  ?, ?,
                  ?, NULL, NULL,
                  ?, ?)
    """, [
        process_id, now, now, machine_id,
        pwd, cmd_json,
        exit_code,
        started_at, ended_at,
        binary_id,
        status, retry_at
    ])

    return process_id
|
||||
|
||||
|
||||
def copy_archiveresult_data_to_process(apps, schema_editor):
    """
    Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records.

    For each ArchiveResult without a process_id:
    1. Parse cmd field (handle both JSON array and space-separated string)
    2. Extract binary name/path from cmd[0]
    3. Get or create Binary record with machine, name, abspath, version
    4. Create Process record with mapped fields
    5. Link ArchiveResult.process_id to new Process

    Status mapping:
    - queued → queued (exit_code=None)
    - started → running (exit_code=None)
    - backoff → queued (exit_code=None)
    - succeeded → exited (exit_code=0)
    - failed → exited (exit_code=1)
    - skipped → exited (exit_code=None)
    """
    # NOTE(review): uses django.db.connection directly rather than
    # schema_editor.connection — confirm multi-db setups aren't supported.
    # Cursor is never closed; harmless for a one-shot migration.
    cursor = connection.cursor()

    # Check if old fields still exist (skip if fresh install or already migrated)
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    cols = {row[1] for row in cursor.fetchall()}

    print(f'DEBUG 0027: Columns found: {sorted(cols)}')
    print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')

    if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
        print('✓ Fresh install or fields already removed - skipping data copy')
        return

    # Check if process_id field exists (should exist from 0026)
    if 'process_id' not in cols:
        print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
        return

    # Get or create Machine.current()
    machine_id = get_or_create_current_machine(cursor)

    # Get ArchiveResults without process_id that have cmd data
    # Use plugin (extractor was renamed to plugin in migration 0025)
    cursor.execute("""
        SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version,
               status, start_ts, end_ts, created_at
        FROM core_archiveresult
        WHERE process_id IS NULL
          AND (cmd IS NOT NULL OR pwd IS NOT NULL)
    """)

    results = cursor.fetchall()

    if not results:
        print('✓ No ArchiveResults need Process migration')
        return

    print(f'Migrating {len(results)} ArchiveResults to Process records...')

    migrated_count = 0
    # NOTE(review): skipped_count is declared but never incremented, so the
    # summary below always reports 0 skipped.
    skipped_count = 0
    error_count = 0

    for i, row in enumerate(results):
        ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row

        # First-row debug output only, to avoid flooding logs on big archives
        if i == 0:
            print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')

        try:
            # Parse cmd field
            cmd_array = parse_cmd_field(cmd_raw)

            if i == 0:
                print(f'DEBUG 0027: Parsed cmd: {cmd_array}')

            # Extract binary info from cmd[0] if available
            binary_id = None
            if cmd_array and cmd_array[0]:
                binary_name = Path(cmd_array[0]).name or plugin  # Fallback to plugin name
                binary_abspath = cmd_array[0]
                binary_version = cmd_version or ''

                # Get or create Binary record
                binary_id = get_or_create_binary(
                    cursor, machine_id, binary_name, binary_abspath, binary_version
                )

                if i == 0:
                    print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')

            # Map status
            process_status, exit_code = map_status(status)

            # Set timestamps (fall back to created_at when start_ts is NULL)
            started_at = start_ts or created_at
            ended_at = end_ts if process_status == 'exited' else None

            # Create Process record
            process_id = create_process(
                cursor=cursor,
                machine_id=machine_id,
                pwd=pwd or '',
                cmd=cmd_array,
                status=process_status,
                exit_code=exit_code,
                started_at=started_at,
                ended_at=ended_at,
                binary_id=binary_id,
            )

            if i == 0:
                print(f'DEBUG 0027: Created Process: id={process_id}')

            # Link ArchiveResult to Process
            cursor.execute(
                "UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
                [process_id, ar_id]
            )

            migrated_count += 1

            if i == 0:
                print(f'DEBUG 0027: Linked ArchiveResult to Process')

        except Exception as e:
            # Best-effort migration: log the failure and keep going so one
            # bad row doesn't abort the whole data copy.
            print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
            import traceback
            traceback.print_exc()
            error_count += 1
            continue

    print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Copy legacy ArchiveResult cmd/pwd/cmd_version data into Process
    records, then drop the old columns.

    The data copy runs first so the RemoveField operations below never
    destroy information that hasn't been migrated yet. The copy is
    irreversible (reverse_code is a noop), so rolling back restores the
    columns but not their contents.
    """

    dependencies = [
        # process_id FK on ArchiveResult must exist before we can link to it
        ('core', '0026_add_process_to_archiveresult'),
        # machine_process table/columns used by create_process()
        ('machine', '0007_add_process_type_and_parent'),
    ]

    operations = [
        # First, copy data from old fields to Process
        migrations.RunPython(
            copy_archiveresult_data_to_process,
            reverse_code=migrations.RunPython.noop,
        ),

        # Now safe to remove old fields (moved from 0025)
        migrations.RemoveField(
            model_name='archiveresult',
            name='cmd',
        ),
        migrations.RemoveField(
            model_name='archiveresult',
            name='cmd_version',
        ),
        migrations.RemoveField(
            model_name='archiveresult',
            name='pwd',
        ),
    ]
|
||||
@@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Migrate filesystem if needed (happens automatically on save)
|
||||
if self.pk and self.fs_migration_needed:
|
||||
from django.db import transaction
|
||||
with transaction.atomic():
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
|
||||
current = next_ver
|
||||
current = next_ver
|
||||
|
||||
# Update version (still in transaction)
|
||||
self.fs_version = target
|
||||
# Update version
|
||||
self.fs_version = target
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
if self.url not in self.crawl.urls:
|
||||
@@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Convert index.json to index.jsonl in the new directory
|
||||
self.convert_index_json_to_jsonl()
|
||||
|
||||
# Create backwards-compat symlink (INSIDE transaction)
|
||||
symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
if symlink_path.is_symlink():
|
||||
symlink_path.unlink()
|
||||
# Schedule cleanup AFTER transaction commits successfully
|
||||
# This ensures DB changes are committed before we delete old files
|
||||
from django.db import transaction
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
|
||||
|
||||
if not symlink_path.exists() or symlink_path == old_dir:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
# Return cleanup info for manual cleanup if needed (when called directly)
|
||||
return (old_dir, new_dir)
|
||||
|
||||
# Schedule old directory deletion AFTER transaction commits
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
|
||||
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path):
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
|
||||
"""
|
||||
Delete old directory after successful migration.
|
||||
Delete old directory and create symlink after successful migration.
|
||||
Called via transaction.on_commit() after DB commit succeeds.
|
||||
"""
|
||||
import shutil
|
||||
import logging
|
||||
|
||||
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
|
||||
|
||||
# Delete old directory
|
||||
if old_dir.exists() and not old_dir.is_symlink():
|
||||
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
|
||||
try:
|
||||
shutil.rmtree(old_dir)
|
||||
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
|
||||
except Exception as e:
|
||||
# Log but don't raise - migration succeeded, this is just cleanup
|
||||
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not remove old migration directory {old_dir}: {e}"
|
||||
)
|
||||
return # Don't create symlink if cleanup failed
|
||||
else:
|
||||
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
|
||||
|
||||
# Create backwards-compat symlink (after old dir is deleted)
|
||||
symlink_path = old_dir # Same path as old_dir
|
||||
if symlink_path.is_symlink():
|
||||
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
|
||||
symlink_path.unlink()
|
||||
|
||||
if not symlink_path.exists():
|
||||
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
|
||||
try:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
print(f"[DEBUG] Successfully created symlink")
|
||||
except Exception as e:
|
||||
print(f"[DEBUG] Failed to create symlink: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
|
||||
)
|
||||
else:
|
||||
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
|
||||
|
||||
# =========================================================================
|
||||
# Path Calculation and Migration Helpers
|
||||
@@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
This enables step-based execution where all hooks in a step can run in parallel.
|
||||
"""
|
||||
from archivebox.hooks import discover_hooks
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
hooks = discover_hooks('Snapshot')
|
||||
# Get merged config with crawl-specific PLUGINS filter
|
||||
config = get_config(crawl=self.crawl, snapshot=self)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
archiveresults = []
|
||||
|
||||
for hook_path in hooks:
|
||||
@@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
started = State(value=Snapshot.StatusChoices.STARTED)
|
||||
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', on='on_started_to_started') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to(started, cond='can_start')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last ArchiveResult finishing)
|
||||
seal = started.to(sealed)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if snapshot processing is complete - delegates to model method."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
@@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
import sys
|
||||
|
||||
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
# Check if any archiveresults were created
|
||||
ar_count = self.snapshot.archiveresult_set.count()
|
||||
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
|
||||
|
||||
def on_started_to_started(self):
|
||||
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
|
||||
# Bump retry_at so we check again in a few seconds
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
)
|
||||
if ar_count == 0:
|
||||
# No archiveresults created, seal immediately
|
||||
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
|
||||
self.seal()
|
||||
else:
|
||||
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
|
||||
# Last AR will manually call self.seal() when done
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(days=365),
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
import sys
|
||||
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
@@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
|
||||
if self.snapshot.crawl:
|
||||
crawl = self.snapshot.crawl
|
||||
remaining_active = Snapshot.objects.filter(
|
||||
crawl=crawl,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
|
||||
# Seal the parent crawl
|
||||
crawl.sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
class StatusChoices(models.TextChoices):
|
||||
@@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
def _check_and_seal_parent_snapshot(self):
|
||||
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
|
||||
import sys
|
||||
|
||||
snapshot = self.archiveresult.snapshot
|
||||
|
||||
# Check if all archiveresults are finished (in final states)
|
||||
remaining_active = snapshot.archiveresult_set.exclude(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
|
||||
# Seal the parent snapshot
|
||||
snapshot.sm.seal()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
@@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
@@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
|
||||
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user