diff --git a/README.md b/README.md
index 45406ee6..00656468 100644
--- a/README.md
+++ b/README.md
@@ -763,7 +763,7 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
TIMEOUT=240 # default: 60 add more seconds on slower networks
CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL
-SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving
+SAVE_ARCHIVEDOTORG=False # default: True False = disable Archive.org saving
MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size
PUBLIC_INDEX=True # default: True whether anon users can view index
@@ -959,7 +959,7 @@ archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
archivebox add 'https://vimeo.com/somePrivateVideo'
# without first disabling saving to Archive.org:
-archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org
+archivebox config --set SAVE_ARCHIVEDOTORG=False # disable saving all URLs in Archive.org
# restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
archivebox config --set PUBLIC_INDEX=False
diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index 2cf819d4..8c0be7a0 100755
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -26,10 +26,10 @@ ASCII_LOGO = """
PACKAGE_DIR = Path(__file__).resolve().parent
-# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
-# Migrations reference models like 'machine.Binary' which need to be importable
-if str(PACKAGE_DIR) not in sys.path:
- sys.path.append(str(PACKAGE_DIR))
+# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models
+# # Migrations reference models like 'machine.Binary' which need to be importable
+# if str(PACKAGE_DIR) not in sys.path:
+# sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
os.environ['TZ'] = 'UTC'
diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py
index 86ee88ad..a1a0655a 100644
--- a/archivebox/api/apps.py
+++ b/archivebox/api/apps.py
@@ -5,6 +5,7 @@ from django.apps import AppConfig
class APIConfig(AppConfig):
name = 'archivebox.api'
+ label = 'api'
def register_admin(admin_site):
diff --git a/archivebox/api/v1_workers.py b/archivebox/api/v1_workers.py
index d95c6ff6..f4ff580e 100644
--- a/archivebox/api/v1_workers.py
+++ b/archivebox/api/v1_workers.py
@@ -94,7 +94,7 @@ class OrchestratorSchema(Schema):
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
"""Get the orchestrator status and all worker queues."""
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
orchestrator = Orchestrator()
diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py
index 66499231..55f033b0 100755
--- a/archivebox/base_models/models.py
+++ b/archivebox/base_models/models.py
@@ -73,7 +73,7 @@ class ModelWithUUID(models.Model):
return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
def as_json(self, keys: Iterable[str] = ()) -> dict:
- default_keys = ('id', 'created_at', 'modified_at', 'created_by_id')
+ default_keys = ('id', 'created_at', 'modified_at')
return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)}
@@ -119,7 +119,7 @@ class ModelWithHealthStats(models.Model):
class ModelWithConfig(models.Model):
"""Mixin for models with a JSON config field."""
- config = models.JSONField(default=dict, null=False, blank=False, editable=True)
+ config = models.JSONField(default=dict, null=True, blank=True, editable=True)
class Meta:
abstract = True
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 3a991d39..234d1316 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -56,7 +56,7 @@ def add(urls: str | list[str],
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
created_by_id = created_by_id or get_or_create_system_user_pk()
diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py
index f73553db..3bedaade 100644
--- a/archivebox/cli/archivebox_crawl.py
+++ b/archivebox/cli/archivebox_crawl.py
@@ -78,7 +78,7 @@ def discover_outlinks(
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
created_by_id = get_or_create_system_user_pk()
is_tty = sys.stdout.isatty()
diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py
index 4005f365..29abd63d 100644
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -96,7 +96,7 @@ def run_plugins(
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
)
from archivebox.core.models import Snapshot, ArchiveResult
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
is_tty = sys.stdout.isatty()
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
index e4dc58a4..ed67c77d 100755
--- a/archivebox/cli/archivebox_init.py
+++ b/archivebox/cli/archivebox_init.py
@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
-def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None:
+def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
- install = install or setup
-
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
if pending_links:
- Snapshot.objects.create_from_dicts(list(pending_links.values()))
+ for link_dict in pending_links.values():
+ Snapshot.from_jsonl(link_dict)
# Hint for orphaned snapshot directories
print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
-@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
@docstring(init.__doc__)
def main(**kwargs) -> None:
init(**kwargs)
diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py
index e9a7f7a5..f35adf5e 100755
--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
print()
# Run the crawl synchronously (this triggers on_Crawl hooks)
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
diff --git a/archivebox/cli/archivebox_orchestrator.py b/archivebox/cli/archivebox_orchestrator.py
index 9f277e7d..4b272727 100644
--- a/archivebox/cli/archivebox_orchestrator.py
+++ b/archivebox/cli/archivebox_orchestrator.py
@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
0: All work completed successfully
1: Error occurred
"""
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
if Orchestrator.is_running():
print('[yellow]Orchestrator is already running[/yellow]')
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index fb0b1148..49490142 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
tail_multiple_worker_logs,
is_port_in_use,
)
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
import sys
# Check if port is already in use
diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py
index 6fba01a3..4d2f7b5f 100644
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -163,7 +163,7 @@ def create_snapshots(
# If --plugins is passed, run the orchestrator for those plugins
if plugins:
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index 49ba8f13..b0e29be9 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database...')
- for snapshot in Snapshot.objects.iterator():
+ for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()
@@ -209,7 +209,7 @@ def process_filtered_snapshots(
total = snapshots.count()
print(f'[*] Found {total} matching snapshots')
- for snapshot in snapshots.iterator():
+ for snapshot in snapshots.iterator(chunk_size=batch_size):
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()
diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py
index 5a2b74b9..27dec785 100644
--- a/archivebox/cli/tests.py
+++ b/archivebox/cli/tests.py
@@ -17,7 +17,7 @@ TEST_CONFIG = {
'DATA_DIR': 'data.tests',
- 'SAVE_ARCHIVE_DOT_ORG': 'False',
+ 'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'USE_CURL': 'False',
diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py
index 88a7435d..23967550 100644
--- a/archivebox/cli/tests_piping.py
+++ b/archivebox/cli/tests_piping.py
@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
- 'SAVE_ARCHIVE_DOT_ORG': 'False',
+ 'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py
index 40d8db4c..9f6ee979 100644
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -216,6 +216,29 @@ def get_config(
if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config)
+ # Normalize all aliases to canonical names (after all sources merged)
+ # This handles aliases that came from user/crawl/snapshot configs, not just env
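+    # e.g. if a plugin schema lists SAVE_ARCHIVE_DOT_ORG under x-aliases for SAVE_ARCHIVEDOTORG,
+    # a user-supplied SAVE_ARCHIVE_DOT_ORG value is copied onto SAVE_ARCHIVEDOTORG and the alias key is dropped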
+ try:
+ from archivebox.hooks import discover_plugin_configs
+ plugin_configs = discover_plugin_configs()
+ aliases_to_normalize = {} # {alias_key: canonical_key}
+
+ # Build alias mapping from all plugin schemas
+ for plugin_name, schema in plugin_configs.items():
+ for canonical_key, prop_schema in schema.get('properties', {}).items():
+ for alias in prop_schema.get('x-aliases', []):
+ aliases_to_normalize[alias] = canonical_key
+
+ # Normalize: copy alias values to canonical keys (aliases take precedence)
+ for alias_key, canonical_key in aliases_to_normalize.items():
+ if alias_key in config:
+ # Alias exists - copy to canonical key (overwriting any default)
+ config[canonical_key] = config[alias_key]
+ # Remove alias from config to keep it clean
+ del config[alias_key]
+ except ImportError:
+ pass
+
return config
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 5b173784..0362afe3 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -5,8 +5,12 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'archivebox.core'
+ label = 'core'
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from archivebox.core.admin_site import register_admin_site
register_admin_site()
+
+ # Import models to register state machines with the registry
+ from archivebox.core import models # noqa: F401
diff --git a/archivebox/core/migrations/0024_b_clear_config_fields.py b/archivebox/core/migrations/0024_b_clear_config_fields.py
new file mode 100644
index 00000000..112688dd
--- /dev/null
+++ b/archivebox/core/migrations/0024_b_clear_config_fields.py
@@ -0,0 +1,57 @@
+# Data migration to clear config fields that may contain invalid JSON
+# This runs before 0025 to prevent CHECK constraint failures
+
+from django.db import migrations
+
+
+def clear_config_fields(apps, schema_editor):
+ """Clear all config fields in related tables to avoid JSON validation errors."""
+ db_alias = schema_editor.connection.alias
+
+ # Disable foreign key checks temporarily to allow updates
+ with schema_editor.connection.cursor() as cursor:
+ cursor.execute("PRAGMA foreign_keys=OFF")
+
+ tables_to_clear = [
+ ('crawls_seed', 'config'),
+ ('crawls_crawl', 'config'),
+        ('crawls_crawlschedule', 'config'),  # skipped below if the table doesn't exist
+ ('machine_machine', 'stats'),
+ ('machine_machine', 'config'),
+ ]
+
+ for table_info in tables_to_clear:
+ if table_info is None:
+ continue
+ table_name, field_name = table_info
+
+ try:
+ with schema_editor.connection.cursor() as cursor:
+ # Check if table exists first
+ cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+ if not cursor.fetchone():
+ print(f" Skipping {table_name}.{field_name}: table does not exist")
+ continue
+
+ # Set all to empty JSON object
+ cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
+ print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
+ except Exception as e:
+ print(f" Skipping {table_name}.{field_name}: {e}")
+
+ # Re-enable foreign key checks
+ with schema_editor.connection.cursor() as cursor:
+ cursor.execute("PRAGMA foreign_keys=ON")
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0023_new_schema'),
+ ('crawls', '0001_initial'),
+ ('machine', '0001_squashed'),
+ ]
+
+ operations = [
+ migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
+ ]
diff --git a/archivebox/core/migrations/0024_c_disable_fk_checks.py b/archivebox/core/migrations/0024_c_disable_fk_checks.py
new file mode 100644
index 00000000..8bee7270
--- /dev/null
+++ b/archivebox/core/migrations/0024_c_disable_fk_checks.py
@@ -0,0 +1,28 @@
+# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
+
+from django.db import migrations
+
+
+def disable_fk_checks(apps, schema_editor):
+ """Temporarily disable foreign key checks."""
+ with schema_editor.connection.cursor() as cursor:
+ cursor.execute("PRAGMA foreign_keys=OFF")
+ print(" Disabled foreign key checks")
+
+
+def enable_fk_checks(apps, schema_editor):
+ """Re-enable foreign key checks."""
+ with schema_editor.connection.cursor() as cursor:
+ cursor.execute("PRAGMA foreign_keys=ON")
+ print(" Enabled foreign key checks")
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0024_b_clear_config_fields'),
+ ]
+
+ operations = [
+ migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
+ ]
diff --git a/archivebox/core/migrations/0024_d_fix_crawls_config.py b/archivebox/core/migrations/0024_d_fix_crawls_config.py
new file mode 100644
index 00000000..e1df3322
--- /dev/null
+++ b/archivebox/core/migrations/0024_d_fix_crawls_config.py
@@ -0,0 +1,93 @@
+# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
+
+from django.db import migrations
+
+
+def fix_crawls_config(apps, schema_editor):
+ """
+ Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
+ Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
+ For fresh installs, crawls.0001_initial creates the correct schema.
+ """
+ with schema_editor.connection.cursor() as cursor:
+ # Check if this is an upgrade from old 0.8.x or a fresh install
+ # In fresh installs, crawls.0001_initial was applied, creating seed FK
+ # In upgrades, the table was created by old migrations before 0001_initial existed
+ cursor.execute("""
+ SELECT COUNT(*) FROM django_migrations
+ WHERE app='crawls' AND name='0001_initial'
+ """)
+ has_crawls_0001 = cursor.fetchone()[0] > 0
+
+ if has_crawls_0001:
+ # Fresh install - crawls.0001_initial already created the correct schema
+ # Just clear config to avoid CHECK constraint issues
+ print(" Fresh install detected - clearing config field only")
+ try:
+ cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+ except Exception as e:
+ print(f" Skipping config clear: {e}")
+ return
+
+ # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
+ print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
+ cursor.execute("PRAGMA foreign_keys=OFF")
+
+ # Backup
+ cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
+
+ # Recreate without config CHECK constraint, with nullable seed_id
+ cursor.execute("DROP TABLE crawls_crawl")
+ cursor.execute("""
+ CREATE TABLE "crawls_crawl" (
+ "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
+ "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
+ "id" char(32) NOT NULL PRIMARY KEY,
+ "created_at" datetime NOT NULL,
+ "modified_at" datetime NOT NULL,
+ "urls" text NOT NULL,
+ "config" text,
+ "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
+ "tags_str" varchar(1024) NOT NULL,
+ "persona_id" char(32) NULL,
+ "label" varchar(64) NOT NULL,
+ "notes" text NOT NULL,
+ "output_dir" varchar(512) NOT NULL,
+ "status" varchar(15) NOT NULL,
+ "retry_at" datetime NULL,
+ "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
+ "seed_id" char(32) NULL DEFAULT NULL,
+ "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
+ )
+ """)
+
+ # Restore data
+ cursor.execute("""
+ INSERT INTO "crawls_crawl" (
+ "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+ "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+ "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+ )
+ SELECT
+ "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
+ "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
+ "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
+ FROM crawls_crawl_backup
+ """)
+
+ cursor.execute("DROP TABLE crawls_crawl_backup")
+
+ # NULL out config to avoid any invalid JSON
+ cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0024_c_disable_fk_checks'),
+ ('crawls', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
+ ]
diff --git a/archivebox/core/migrations/0024_snapshot_crawl.py b/archivebox/core/migrations/0024_snapshot_crawl.py
index 84c285bc..c8b47bf2 100644
--- a/archivebox/core/migrations/0024_snapshot_crawl.py
+++ b/archivebox/core/migrations/0024_snapshot_crawl.py
@@ -8,9 +8,7 @@ import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
- ('core', '0023_new_schema'),
- ('crawls', '0001_initial'),
- ('machine', '0001_squashed'),
+ ('core', '0024_d_fix_crawls_config'),
]
operations = [
diff --git a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
index 13707940..5ec70d47 100755
--- a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
+++ b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
@@ -10,6 +10,13 @@ from django.db import migrations, models
def populate_archiveresult_uuids(apps, schema_editor):
"""Generate unique UUIDs for ArchiveResults that don't have one."""
+ # Check if uuid column exists before trying to populate it
+ with schema_editor.connection.cursor() as cursor:
+ cursor.execute("PRAGMA table_info(core_archiveresult)")
+ columns = [row[1] for row in cursor.fetchall()]
+ if 'uuid' not in columns:
+ return # uuid column doesn't exist, skip this data migration
+
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for result in ArchiveResult.objects.filter(uuid__isnull=True):
result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
pass
+def remove_output_dir_if_exists(apps, schema_editor):
+ """Remove output_dir columns if they exist."""
+ with schema_editor.connection.cursor() as cursor:
+ # Check and remove from core_archiveresult
+ cursor.execute("PRAGMA table_info(core_archiveresult)")
+ columns = [row[1] for row in cursor.fetchall()]
+ if 'output_dir' in columns:
+ cursor.execute("ALTER TABLE core_archiveresult DROP COLUMN output_dir")
+
+ # Check and remove from core_snapshot
+ cursor.execute("PRAGMA table_info(core_snapshot)")
+ columns = [row[1] for row in cursor.fetchall()]
+ if 'output_dir' in columns:
+ cursor.execute("ALTER TABLE core_snapshot DROP COLUMN output_dir")
+
+
class Migration(migrations.Migration):
dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
# Remove output_dir fields (not needed, computed from snapshot)
- migrations.RemoveField(
- model_name='archiveresult',
- name='output_dir',
- ),
- migrations.RemoveField(
- model_name='snapshot',
- name='output_dir',
+ migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
+
+ # Update Django's migration state to match 0.9.x schema
+ # Database already has correct types from 0.8.x, just update state
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ # Archiveresult field alterations
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='extractor',
+ field=models.CharField(db_index=True, max_length=32),
+ ),
+ # Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='id',
+ field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='status',
+ field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+ ),
+
+ # Snapshot field alterations
+ migrations.AlterField(
+ model_name='snapshot',
+ name='bookmarked_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='downloaded_at',
+ field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='id',
+ field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ ],
+ database_operations=[
+ # No actual database changes needed - schema is already correct from 0.8.x
+ ],
),
- # Archiveresult field alterations
- migrations.AlterField(
- model_name='archiveresult',
- name='created_at',
- field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='created_by',
- field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='extractor',
- field=models.CharField(db_index=True, max_length=32),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='id',
- field=models.AutoField(editable=False, primary_key=True, serialize=False),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='status',
- field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
- ),
-
- # Snapshot field alterations
- migrations.AlterField(
- model_name='snapshot',
- name='bookmarked_at',
- field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
- ),
- migrations.AlterField(
- model_name='snapshot',
- name='created_at',
- field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
- ),
- migrations.AlterField(
- model_name='snapshot',
- name='created_by',
- field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
- ),
- migrations.AlterField(
- model_name='snapshot',
- name='downloaded_at',
- field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
- ),
- migrations.AlterField(
- model_name='snapshot',
- name='id',
- field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
- ),
-
- # SnapshotTag and Tag alterations
- migrations.AlterField(
- model_name='snapshottag',
- name='id',
- field=models.AutoField(primary_key=True, serialize=False),
- ),
- migrations.AlterField(
- model_name='tag',
- name='created_by',
- field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
- ),
- migrations.AlterUniqueTogether(
- name='snapshottag',
- unique_together={('snapshot', 'tag')},
+ # SnapshotTag and Tag alterations - state only, DB already correct
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AlterField(
+ model_name='snapshottag',
+ name='id',
+ field=models.AutoField(primary_key=True, serialize=False),
+ ),
+ migrations.AlterField(
+ model_name='tag',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterUniqueTogether(
+ name='snapshottag',
+ unique_together={('snapshot', 'tag')},
+ ),
+ ],
+ database_operations=[],
),
]
diff --git a/archivebox/core/migrations/0029_archiveresult_hook_fields.py b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
index 41096eee..a8ddfb27 100644
--- a/archivebox/core/migrations/0029_archiveresult_hook_fields.py
+++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
]
operations = [
- # Add new output fields (keep old 'output' temporarily for migration)
- migrations.AddField(
- model_name='archiveresult',
- name='output_str',
- field=models.TextField(
- blank=True,
- default='',
- help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
- ),
- ),
-
- migrations.AddField(
- model_name='archiveresult',
- name='output_json',
- field=models.JSONField(
- null=True,
- blank=True,
- default=None,
- help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
- ),
- ),
-
- migrations.AddField(
- model_name='archiveresult',
- name='output_files',
- field=models.JSONField(
- default=dict,
- help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
- ),
- ),
-
- migrations.AddField(
- model_name='archiveresult',
- name='output_size',
- field=models.BigIntegerField(
- default=0,
- help_text='Total recursive size in bytes of all output files'
- ),
- ),
-
- migrations.AddField(
- model_name='archiveresult',
- name='output_mimetypes',
- field=models.CharField(
- max_length=512,
- blank=True,
- default='',
- help_text='CSV of mimetypes sorted by size descending'
- ),
- ),
-
- # Add binary FK (optional)
- migrations.AddField(
- model_name='archiveresult',
- name='binary',
- field=models.ForeignKey(
- 'machine.Binary',
- on_delete=models.SET_NULL,
- null=True,
- blank=True,
- related_name='archiveresults',
- help_text='Primary binary used by this hook (optional)'
- ),
+ # Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_str',
+ field=models.TextField(
+ blank=True,
+ default='',
+ help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+ ),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_json',
+ field=models.JSONField(
+ null=True,
+ blank=True,
+ default=None,
+ help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+ ),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_files',
+ field=models.JSONField(
+ default=dict,
+ help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+ ),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_size',
+ field=models.BigIntegerField(
+ default=0,
+ help_text='Total recursive size in bytes of all output files'
+ ),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_mimetypes',
+ field=models.CharField(
+ max_length=512,
+ blank=True,
+ default='',
+ help_text='CSV of mimetypes sorted by size descending'
+ ),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='binary',
+ field=models.ForeignKey(
+ 'machine.Binary',
+ on_delete=models.SET_NULL,
+ null=True,
+ blank=True,
+ related_name='archiveresults',
+ help_text='Primary binary used by this hook (optional)'
+ ),
+ ),
+ ],
+ database_operations=[
+ migrations.RunSQL(
+ sql="""
+ ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
+ ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
+ ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
+ ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
+ ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
+ ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ],
),
]
diff --git a/archivebox/core/migrations/0030_migrate_output_field.py b/archivebox/core/migrations/0030_migrate_output_field.py
index 5dafb7e8..6c0501ae 100644
--- a/archivebox/core/migrations/0030_migrate_output_field.py
+++ b/archivebox/core/migrations/0030_migrate_output_field.py
@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
Logic:
- If output contains JSON {...}, move to output_json
- Otherwise, move to output_str
+
+ Use raw SQL to avoid CHECK constraint issues during migration.
"""
- ArchiveResult = apps.get_model('core', 'ArchiveResult')
+ # Use raw SQL to migrate data without triggering CHECK constraints
+ with schema_editor.connection.cursor() as cursor:
+ # Get all archive results
+ cursor.execute("""
+ SELECT id, output FROM core_archiveresult
+ """)
- for ar in ArchiveResult.objects.all().iterator():
- old_output = ar.output or ''
+ for row in cursor.fetchall():
+ ar_id, old_output = row
+ old_output = old_output or ''
- # Case 1: JSON output
- if old_output.strip().startswith('{'):
- try:
- parsed = json.loads(old_output)
- ar.output_json = parsed
- ar.output_str = ''
- except json.JSONDecodeError:
- # Not valid JSON, treat as string
- ar.output_str = old_output
-
- # Case 2: File path or plain string
- else:
- ar.output_str = old_output
-
- ar.save(update_fields=['output_str', 'output_json'])
+ # Case 1: JSON output
+ if old_output.strip().startswith('{'):
+ try:
+ # Validate it's actual JSON
+ parsed = json.loads(old_output)
+ # Update with JSON - cast to JSON to satisfy CHECK constraint
+ json_str = json.dumps(parsed)
+ cursor.execute("""
+ UPDATE core_archiveresult
+ SET output_str = '', output_json = json(?)
+ WHERE id = ?
+ """, (json_str, ar_id))
+ except json.JSONDecodeError:
+ # Not valid JSON, treat as string
+ cursor.execute("""
+ UPDATE core_archiveresult
+ SET output_str = ?, output_json = NULL
+ WHERE id = ?
+ """, (old_output, ar_id))
+ # Case 2: File path or plain string
+ else:
+ cursor.execute("""
+ UPDATE core_archiveresult
+ SET output_str = ?, output_json = NULL
+ WHERE id = ?
+ """, (old_output, ar_id))
def reverse_migrate(apps, schema_editor):
diff --git a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
index cd8eb821..bbe45cba 100644
--- a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
+++ b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py
@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
]
operations = [
- migrations.AlterField(
- model_name='archiveresult',
- name='binary',
- field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+ # Update Django's state only - database already has correct schema from 0029
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='binary',
+ field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output_files',
+ field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output_json',
+ field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output_mimetypes',
+ field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output_size',
+ field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='output_str',
+ field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='uuid',
+ field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
+ ),
+ ],
+ database_operations=[
+ # No database changes needed - columns already exist with correct types
+ ],
),
- migrations.AlterField(
- model_name='archiveresult',
- name='output_files',
- field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='output_json',
- field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='output_mimetypes',
- field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='output_size',
- field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='output_str',
- field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
- ),
- migrations.AlterField(
- model_name='archiveresult',
- name='uuid',
- field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
- ),
- migrations.AddConstraint(
- model_name='snapshot',
- constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+ # Add unique constraint without table rebuild
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AddConstraint(
+ model_name='snapshot',
+ constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
+ ),
+ ],
+ database_operations=[
+ migrations.RunSQL(
+ sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
+ reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
+ ),
+ ],
),
]
diff --git a/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py b/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py
index 4e0a20bf..bedb58db 100644
--- a/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py
+++ b/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py
@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
]
operations = [
- migrations.RenameField(
- model_name='archiveresult',
- old_name='extractor',
- new_name='plugin',
- ),
- migrations.AddField(
- model_name='archiveresult',
- name='hook_name',
- field=models.CharField(
- blank=True,
- default='',
- max_length=255,
- db_index=True,
- help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
- ),
+ # Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.RenameField(
+ model_name='archiveresult',
+ old_name='extractor',
+ new_name='plugin',
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='hook_name',
+ field=models.CharField(
+ blank=True,
+ default='',
+ max_length=255,
+ db_index=True,
+ help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
+ ),
+ ),
+ ],
+ database_operations=[
+ migrations.RunSQL(
+ sql="""
+ ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
+ ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
+ CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ],
),
]
diff --git a/archivebox/core/migrations/0034_snapshot_current_step.py b/archivebox/core/migrations/0034_snapshot_current_step.py
index f570230c..4b89fa21 100644
--- a/archivebox/core/migrations/0034_snapshot_current_step.py
+++ b/archivebox/core/migrations/0034_snapshot_current_step.py
@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
]
operations = [
- migrations.AddField(
- model_name='snapshot',
- name='current_step',
- field=models.PositiveSmallIntegerField(
- default=0,
- db_index=True,
- help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
- ),
+ # Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AddField(
+ model_name='snapshot',
+ name='current_step',
+ field=models.PositiveSmallIntegerField(
+ default=0,
+ db_index=True,
+ help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
+ ),
+ ),
+ ],
+ database_operations=[
+ migrations.RunSQL(
+ sql="""
+ ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
+ CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ],
),
]
diff --git a/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py b/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
index 50a3f33f..84ea3c23 100644
--- a/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
+++ b/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py
@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0034_snapshot_current_step'),
- ('crawls', '0004_alter_crawl_output_dir'),
+ ('crawls', '0005_drop_seed_id_column'),
]
operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
reverse_code=migrations.RunPython.noop,
),
- # Step 2: Make crawl non-nullable
- migrations.AlterField(
- model_name='snapshot',
- name='crawl',
- field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
- ),
-
- # Step 3: Remove created_by field
- migrations.RemoveField(
- model_name='snapshot',
- name='created_by',
+ # Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ # Make crawl non-nullable
+ migrations.AlterField(
+ model_name='snapshot',
+ name='crawl',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
+ ),
+ # Remove created_by field from Django's state
+ migrations.RemoveField(
+ model_name='snapshot',
+ name='created_by',
+ ),
+ ],
+ database_operations=[
+ # No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
+ # created_by_id column remains in database but is unused
+ ],
),
]
diff --git a/archivebox/core/migrations/0036_remove_archiveresult_created_by.py b/archivebox/core/migrations/0036_remove_archiveresult_created_by.py
index 6a6d1f1f..5b6983c0 100644
--- a/archivebox/core/migrations/0036_remove_archiveresult_created_by.py
+++ b/archivebox/core/migrations/0036_remove_archiveresult_created_by.py
@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
]
operations = [
- # Remove created_by field from ArchiveResult
+ # Remove created_by field from ArchiveResult (state only)
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
- migrations.RemoveField(
- model_name='archiveresult',
- name='created_by',
+ # Leave created_by_id column in database (unused but harmless, avoids table rebuild)
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.RemoveField(
+ model_name='archiveresult',
+ name='created_by',
+ ),
+ ],
+ database_operations=[
+ # No database changes - leave created_by_id column in place to avoid table rebuild
+ ],
),
]
diff --git a/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py
new file mode 100644
index 00000000..592eed6a
--- /dev/null
+++ b/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py
@@ -0,0 +1,44 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0036_remove_archiveresult_created_by'),
+ ]
+
+ operations = [
+ # Update Django's state only - database columns remain for backwards compat
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.RemoveField(
+ model_name='archiveresult',
+ name='output_dir',
+ ),
+ migrations.RemoveField(
+ model_name='snapshot',
+ name='output_dir',
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='config',
+ field=models.JSONField(blank=True, default=dict, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='config',
+ field=models.JSONField(blank=True, default=dict, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='tags',
+ field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
+ ),
+ ],
+ database_operations=[
+ # No database changes - columns remain in place to avoid table rebuilds
+ ],
+ ),
+ ]
diff --git a/archivebox/core/migrations/0038_fix_missing_columns.py b/archivebox/core/migrations/0038_fix_missing_columns.py
new file mode 100644
index 00000000..3c1e6551
--- /dev/null
+++ b/archivebox/core/migrations/0038_fix_missing_columns.py
@@ -0,0 +1,84 @@
+# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
+
+from django.db import migrations, models, connection
+import django.utils.timezone
+
+
+def add_columns_if_not_exist(apps, schema_editor):
+ """Add columns to ArchiveResult only if they don't already exist."""
+ with connection.cursor() as cursor:
+ # Get existing columns
+ cursor.execute("PRAGMA table_info(core_archiveresult)")
+ existing_columns = {row[1] for row in cursor.fetchall()}
+
+ # Add num_uses_failed if it doesn't exist
+ if 'num_uses_failed' not in existing_columns:
+ cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
+
+ # Add num_uses_succeeded if it doesn't exist
+ if 'num_uses_succeeded' not in existing_columns:
+ cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
+
+ # Add config if it doesn't exist
+ if 'config' not in existing_columns:
+ cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
+
+ # Add retry_at if it doesn't exist
+ if 'retry_at' not in existing_columns:
+ cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
+ cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0037_remove_archiveresult_output_dir_and_more'),
+ ]
+
+ operations = [
+ # Add missing columns to ArchiveResult
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AddField(
+ model_name='archiveresult',
+ name='num_uses_failed',
+ field=models.PositiveIntegerField(default=0),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='num_uses_succeeded',
+ field=models.PositiveIntegerField(default=0),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='config',
+ field=models.JSONField(blank=True, default=dict, null=True),
+ ),
+ migrations.AddField(
+ model_name='archiveresult',
+ name='retry_at',
+ field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
+ ),
+ ],
+ database_operations=[
+ migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
+ ],
+ ),
+
+ # Drop created_by_id from Snapshot (database only, already removed from model in 0035)
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ # No state changes - field already removed in 0035
+ ],
+ database_operations=[
+ migrations.RunSQL(
+ sql="""
+ -- Drop index first, then column
+ DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
+ ALTER TABLE core_snapshot DROP COLUMN created_by_id;
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ],
+ ),
+ ]
diff --git a/archivebox/core/migrations/0039_fix_num_uses_values.py b/archivebox/core/migrations/0039_fix_num_uses_values.py
new file mode 100644
index 00000000..4c04ed3e
--- /dev/null
+++ b/archivebox/core/migrations/0039_fix_num_uses_values.py
@@ -0,0 +1,30 @@
+# Fix num_uses_failed, num_uses_succeeded, and depth values that were stored as strings instead of integers
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0038_fix_missing_columns'),
+ ]
+
+ operations = [
+ # Fix string values that got inserted as literals instead of integers
+ migrations.RunSQL(
+ sql="""
+ UPDATE core_snapshot
+ SET num_uses_failed = 0
+ WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
+
+ UPDATE core_snapshot
+ SET num_uses_succeeded = 0
+ WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
+
+ UPDATE core_snapshot
+ SET depth = 0
+ WHERE typeof(depth) = 'text' OR depth = 'depth';
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index cf4216c6..4c0e026b 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
merged = 0
- for dup in duplicates.iterator():
+ for dup in duplicates.iterator(chunk_size=500):
snapshots = list(
cls.objects
.filter(url=dup['url'], timestamp=dup['timestamp'])
diff --git a/archivebox/core/models.py.bak b/archivebox/core/models.py.bak
deleted file mode 100755
index a99d9360..00000000
--- a/archivebox/core/models.py.bak
+++ /dev/null
@@ -1,2638 +0,0 @@
-__package__ = 'archivebox.core'
-
-from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
-from archivebox.uuid_compat import uuid7
-from datetime import datetime, timedelta
-from django_stubs_ext.db.models import TypedModelMeta
-
-import os
-import json
-from pathlib import Path
-
-from statemachine import State, registry
-
-from django.db import models
-from django.db.models import QuerySet, Value, Case, When, IntegerField
-from django.utils.functional import cached_property
-from django.utils.text import slugify
-from django.utils import timezone
-from django.core.cache import cache
-from django.urls import reverse, reverse_lazy
-from django.contrib import admin
-from django.conf import settings
-
-from archivebox.config import CONSTANTS
-from archivebox.misc.system import get_dir_size, atomic_write
-from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
-from archivebox.misc.hashing import get_dir_info
-from archivebox.hooks import (
- EXTRACTOR_INDEXING_PRECEDENCE,
- get_plugins, get_plugin_name, get_plugin_icon,
- DEFAULT_PLUGIN_ICONS,
-)
-from archivebox.base_models.models import (
- ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
- ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
- get_or_create_system_user_pk,
-)
-from workers.models import ModelWithStateMachine, BaseStateMachine
-from workers.tasks import bg_archive_snapshot
-from archivebox.crawls.models import Crawl
-from archivebox.machine.models import NetworkInterface, Binary
-
-
-
-class Tag(ModelWithSerializers):
- # Keep AutoField for compatibility with main branch migrations
- # Don't use UUIDField here - requires complex FK transformation
- id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
- created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
- created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True)
- modified_at = models.DateTimeField(auto_now=True)
- name = models.CharField(unique=True, blank=False, max_length=100)
- slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
-
- snapshot_set: models.Manager['Snapshot']
-
- class Meta(TypedModelMeta):
- verbose_name = "Tag"
- verbose_name_plural = "Tags"
-
- def __str__(self):
- return self.name
-
- def save(self, *args, **kwargs):
- is_new = self._state.adding
- if is_new:
- self.slug = slugify(self.name)
- existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
- i = None
- while True:
- slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
- if slug not in existing:
- self.slug = slug
- break
- i = (i or 0) + 1
- super().save(*args, **kwargs)
-
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created Tag',
- indent_level=0,
- metadata={
- 'id': self.id,
- 'name': self.name,
- 'slug': self.slug,
- },
- )
-
- @property
- def api_url(self) -> str:
- return reverse_lazy('api-1:get_tag', args=[self.id])
-
- @staticmethod
- def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
- """
- Create/update Tag from JSONL record.
-
- Args:
- record: JSONL record with 'name' field
- overrides: Optional dict with 'snapshot' to auto-attach tag
-
- Returns:
- Tag instance or None
- """
- from archivebox.misc.jsonl import get_or_create_tag
-
- try:
- tag = get_or_create_tag(record)
-
- # Auto-attach to snapshot if in overrides
- if overrides and 'snapshot' in overrides and tag:
- overrides['snapshot'].tags.add(tag)
-
- return tag
- except ValueError:
- return None
-
-
-class SnapshotTag(models.Model):
- id = models.AutoField(primary_key=True)
- snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id')
- tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
-
- class Meta:
- db_table = 'core_snapshot_tags'
- unique_together = [('snapshot', 'tag')]
-
-
-class SnapshotQuerySet(models.QuerySet):
- """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""
-
- # =========================================================================
- # Filtering Methods
- # =========================================================================
-
- FILTER_TYPES = {
- 'exact': lambda pattern: models.Q(url=pattern),
- 'substring': lambda pattern: models.Q(url__icontains=pattern),
- 'regex': lambda pattern: models.Q(url__iregex=pattern),
- 'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
- 'tag': lambda pattern: models.Q(tags__name=pattern),
- 'timestamp': lambda pattern: models.Q(timestamp=pattern),
- }
-
- def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet':
- """Filter snapshots by URL patterns using specified filter type"""
- from archivebox.misc.logging import stderr
-
- q_filter = models.Q()
- for pattern in patterns:
- try:
- q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
- except KeyError:
- stderr()
- stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
- stderr(f' {pattern}')
- raise SystemExit(2)
- return self.filter(q_filter)
-
- def search(self, patterns: List[str]) -> 'SnapshotQuerySet':
- """Search snapshots using the configured search backend"""
- from archivebox.config.common import SEARCH_BACKEND_CONFIG
- from archivebox.search import query_search_index
- from archivebox.misc.logging import stderr
-
- if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
- stderr()
- stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
- raise SystemExit(2)
-
- qsearch = self.none()
- for pattern in patterns:
- try:
- qsearch |= query_search_index(pattern)
- except:
- raise SystemExit(2)
- return self.all() & qsearch
-
- # =========================================================================
- # Export Methods
- # =========================================================================
-
- def to_json(self, with_headers: bool = False) -> str:
- """Generate JSON index from snapshots"""
- import sys
- from datetime import datetime, timezone as tz
- from archivebox.config import VERSION
- from archivebox.config.common import SERVER_CONFIG
-
- MAIN_INDEX_HEADER = {
- 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
- 'schema': 'archivebox.index.json',
- 'copyright_info': SERVER_CONFIG.FOOTER_INFO,
- 'meta': {
- 'project': 'ArchiveBox',
- 'version': VERSION,
- 'git_sha': VERSION,
- 'website': 'https://ArchiveBox.io',
- 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
- 'source': 'https://github.com/ArchiveBox/ArchiveBox',
- 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
- 'dependencies': {},
- },
- } if with_headers else {}
-
- snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
-
- if with_headers:
- output = {
- **MAIN_INDEX_HEADER,
- 'num_links': len(snapshot_dicts),
- 'updated': datetime.now(tz.utc),
- 'last_run_cmd': sys.argv,
- 'links': snapshot_dicts,
- }
- else:
- output = snapshot_dicts
- return to_json(output, indent=4, sort_keys=True)
-
- def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
- """Generate CSV output from snapshots"""
- cols = cols or ['timestamp', 'is_archived', 'url']
- header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
- row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
- return '\n'.join((header_str, *row_strs))
-
- def to_html(self, with_headers: bool = True) -> str:
- """Generate main index HTML from snapshots"""
- from datetime import datetime, timezone as tz
- from django.template.loader import render_to_string
- from archivebox.config import VERSION
- from archivebox.config.common import SERVER_CONFIG
- from archivebox.config.version import get_COMMIT_HASH
-
- template = 'static_index.html' if with_headers else 'minimal_index.html'
- snapshot_list = list(self.iterator(chunk_size=500))
-
- return render_to_string(template, {
- 'version': VERSION,
- 'git_sha': get_COMMIT_HASH() or VERSION,
- 'num_links': str(len(snapshot_list)),
- 'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
- 'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
- 'links': snapshot_list,
- 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
- })
-
-
-class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
- """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
-
- def filter(self, *args, **kwargs):
- domain = kwargs.pop('domain', None)
- qs = super().filter(*args, **kwargs)
- if domain:
- qs = qs.filter(url__icontains=f'://{domain}')
- return qs
-
- def get_queryset(self):
- return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
-
- # =========================================================================
- # Import Methods
- # =========================================================================
-
- def remove(self, atomic: bool = False) -> tuple:
- """Remove snapshots from the database"""
- from django.db import transaction
- if atomic:
- with transaction.atomic():
- return self.delete()
- return self.delete()
-
-
-class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
- id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
- created_at = models.DateTimeField(default=timezone.now, db_index=True)
- modified_at = models.DateTimeField(auto_now=True)
-
- url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
- timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
- bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
- crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
- parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
-
- title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
- downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
- depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
- fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
- current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
-
- retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
- status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
- config = models.JSONField(default=dict, null=False, blank=False, editable=True)
- notes = models.TextField(blank=True, null=False, default='')
- output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
-
- tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
-
- state_machine_name = 'core.models.SnapshotMachine'
- state_field_name = 'status'
- retry_at_field_name = 'retry_at'
- StatusChoices = ModelWithStateMachine.StatusChoices
- active_state = StatusChoices.STARTED
-
- objects = SnapshotManager()
- archiveresult_set: models.Manager['ArchiveResult']
-
- class Meta(TypedModelMeta):
- verbose_name = "Snapshot"
- verbose_name_plural = "Snapshots"
- constraints = [
- # Allow same URL in different crawls, but not duplicates within same crawl
- models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
- # Global timestamp uniqueness for 1:1 symlink mapping
- models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
- ]
-
- def __str__(self):
- return f'[{self.id}] {self.url[:64]}'
-
- def save(self, *args, **kwargs):
- is_new = self._state.adding
- if not self.bookmarked_at:
- self.bookmarked_at = self.created_at or timezone.now()
- if not self.timestamp:
- self.timestamp = str(self.bookmarked_at.timestamp())
-
- # Migrate filesystem if needed (happens automatically on save)
- if self.pk and self.fs_migration_needed:
- from django.db import transaction
- with transaction.atomic():
- # Walk through migration chain automatically
- current = self.fs_version
- target = self._fs_current_version()
-
- while current != target:
- next_ver = self._fs_next_version(current)
- method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
-
- # Only run if method exists (most are no-ops)
- if hasattr(self, method):
- getattr(self, method)()
-
- current = next_ver
-
- # Update version (still in transaction)
- self.fs_version = target
-
- super().save(*args, **kwargs)
- if self.crawl and self.url not in self.crawl.urls:
- self.crawl.urls += f'\n{self.url}'
- self.crawl.save()
-
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created Snapshot',
- indent_level=2,
- url=self.url,
- metadata={
- 'id': str(self.id),
- 'crawl_id': str(self.crawl_id) if self.crawl_id else None,
- 'depth': self.depth,
- 'status': self.status,
- },
- )
-
- # =========================================================================
- # Filesystem Migration Methods
- # =========================================================================
-
- @staticmethod
- def _fs_current_version() -> str:
- """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
- from archivebox.config import VERSION
- # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
- parts = VERSION.split('.')
- if len(parts) >= 2:
- major, minor = parts[0], parts[1]
- # Strip any non-numeric suffix from minor version
- minor = ''.join(c for c in minor if c.isdigit())
- return f'{major}.{minor}.0'
- return '0.9.0' # Fallback if version parsing fails
-
- @property
- def fs_migration_needed(self) -> bool:
- """Check if snapshot needs filesystem migration"""
- return self.fs_version != self._fs_current_version()
-
- def _fs_next_version(self, version: str) -> str:
- """Get next version in migration chain"""
- chain = ['0.7.0', '0.8.0', '0.9.0']
- try:
- idx = chain.index(version)
- return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
- except ValueError:
- # Unknown version - skip to current
- return self._fs_current_version()
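-
- # Illustrative sketch (not part of the original code) of how save() walks the chain
- # for an old snapshot: each hop resolves to an optional migration method by name:
- #
- #     '0.7.0' -> _fs_next_version -> '0.8.0' -> '_fs_migrate_from_0_7_0_to_0_8_0'
- #     '0.8.0' -> _fs_next_version -> '0.9.0' -> '_fs_migrate_from_0_8_0_to_0_9_0'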
-
- def _fs_migrate_from_0_7_0_to_0_8_0(self):
- """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
- # 0.7 and 0.8 both used archive/
- # Nothing to do!
- pass
-
- def _fs_migrate_from_0_8_0_to_0_9_0(self):
- """
- Migrate from flat to nested structure.
-
- 0.8.x: archive/{timestamp}/
- 0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
-
- Transaction handling:
- 1. Copy files INSIDE transaction
- 2. Create symlink INSIDE transaction
- 3. Update fs_version INSIDE transaction (done by save())
- 4. Exit transaction (DB commit)
- 5. Delete old files OUTSIDE transaction (after commit)
- """
- import shutil
- from django.db import transaction
-
- old_dir = self.get_storage_path_for_version('0.8.0')
- new_dir = self.get_storage_path_for_version('0.9.0')
-
- if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
- return
-
- new_dir.mkdir(parents=True, exist_ok=True)
-
- # Copy all files (idempotent)
- for old_file in old_dir.rglob('*'):
- if not old_file.is_file():
- continue
-
- rel_path = old_file.relative_to(old_dir)
- new_file = new_dir / rel_path
-
- # Skip if already copied
- if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size:
- continue
-
- new_file.parent.mkdir(parents=True, exist_ok=True)
- shutil.copy2(old_file, new_file)
-
- # Verify all copied
- old_files = {f.relative_to(old_dir): f.stat().st_size
- for f in old_dir.rglob('*') if f.is_file()}
- new_files = {f.relative_to(new_dir): f.stat().st_size
- for f in new_dir.rglob('*') if f.is_file()}
-
- if old_files.keys() != new_files.keys():
- missing = old_files.keys() - new_files.keys()
- raise Exception(f"Migration incomplete: missing {missing}")
-
- # Create backwards-compat symlink (INSIDE transaction)
- symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
- if symlink_path.is_symlink():
- symlink_path.unlink()
-
- if not symlink_path.exists() or symlink_path == old_dir:
- symlink_path.symlink_to(new_dir, target_is_directory=True)
-
- # Schedule old directory deletion AFTER transaction commits
- transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
-
- def _cleanup_old_migration_dir(self, old_dir: Path):
- """
- Delete old directory after successful migration.
- Called via transaction.on_commit() after DB commit succeeds.
- """
- import shutil
- import logging
-
- if old_dir.exists() and not old_dir.is_symlink():
- try:
- shutil.rmtree(old_dir)
- except Exception as e:
- # Log but don't raise - migration succeeded, this is just cleanup
- logging.getLogger('archivebox.migration').warning(
- f"Could not remove old migration directory {old_dir}: {e}"
- )
-
- # =========================================================================
- # Path Calculation and Migration Helpers
- # =========================================================================
-
- @staticmethod
- def extract_domain_from_url(url: str) -> str:
- """
- Extract domain from URL for 0.9.x path structure.
- Uses full hostname with sanitized special chars.
-
- Examples:
- https://example.com:8080 → example.com_8080
- https://sub.example.com → sub.example.com
- file:///path → localhost
- data:text/html → data
- """
- from urllib.parse import urlparse
-
- try:
- parsed = urlparse(url)
-
- if parsed.scheme in ('http', 'https'):
- if parsed.port:
- return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
- return parsed.hostname or 'unknown'
- elif parsed.scheme == 'file':
- return 'localhost'
- elif parsed.scheme:
- return parsed.scheme
- else:
- return 'unknown'
- except Exception:
- return 'unknown'
-
- def get_storage_path_for_version(self, version: str) -> Path:
- """
- Calculate storage path for specific filesystem version.
- Centralizes path logic so it's reusable.
-
- 0.7.x/0.8.x: archive/{timestamp}
- 0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
- """
- from datetime import datetime
-
- if version in ('0.7.0', '0.8.0'):
- return CONSTANTS.ARCHIVE_DIR / self.timestamp
-
- elif version in ('0.9.0', '1.0.0'):
- username = self.crawl.created_by.username
-
- # Use created_at for date grouping (fallback to timestamp)
- if self.created_at:
- date_str = self.created_at.strftime('%Y%m%d')
- else:
- date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')
-
- domain = self.extract_domain_from_url(self.url)
-
- return (
- CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' /
- date_str / domain / str(self.id)
- )
- else:
- # Unknown version - use current
- return self.get_storage_path_for_version(self._fs_current_version())
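-
- # Illustrative example (hypothetical username/date/uuid, not from the original code):
- # for a snapshot of https://example.com created on 2024-01-02 by user 'admin',
- # get_storage_path_for_version() resolves roughly to:
- #
- #     '0.8.0' -> <ARCHIVE_DIR>/<timestamp>
- #     '0.9.0' -> <DATA_DIR>/users/admin/snapshots/20240102/example.com/<snapshot uuid>/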
-
- # =========================================================================
- # Loading and Creation from Filesystem (Used by archivebox update ONLY)
- # =========================================================================
-
- @classmethod
- def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
- """
- Load existing Snapshot from DB by reading index.json.
-
- Reads index.json, extracts url+timestamp, queries DB.
- Returns existing Snapshot or None if not found/invalid.
- Does NOT create new snapshots.
-
- ONLY used by: archivebox update (for orphan detection)
- """
- import json
-
- index_path = snapshot_dir / 'index.json'
- if not index_path.exists():
- return None
-
- try:
- with open(index_path) as f:
- data = json.load(f)
- except Exception:
- return None
-
- url = data.get('url')
- if not url:
- return None
-
- # Get timestamp - prefer index.json, fallback to folder name
- timestamp = cls._select_best_timestamp(
- index_timestamp=data.get('timestamp'),
- folder_name=snapshot_dir.name
- )
-
- if not timestamp:
- return None
-
- # Look up existing
- try:
- return cls.objects.get(url=url, timestamp=timestamp)
- except cls.DoesNotExist:
- return None
- except cls.MultipleObjectsReturned:
- # Should not happen with unique constraint
- return cls.objects.filter(url=url, timestamp=timestamp).first()
-
- @classmethod
- def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']:
- """
- Create new Snapshot from orphaned directory.
-
- Validates timestamp, ensures uniqueness.
- Returns new UNSAVED Snapshot or None if invalid.
-
- ONLY used by: archivebox update (for orphan import)
- """
- import json
-
- index_path = snapshot_dir / 'index.json'
- if not index_path.exists():
- return None
-
- try:
- with open(index_path) as f:
- data = json.load(f)
- except Exception:
- return None
-
- url = data.get('url')
- if not url:
- return None
-
- # Get and validate timestamp
- timestamp = cls._select_best_timestamp(
- index_timestamp=data.get('timestamp'),
- folder_name=snapshot_dir.name
- )
-
- if not timestamp:
- return None
-
- # Ensure uniqueness (reuses existing logic from create_or_update_from_dict)
- timestamp = cls._ensure_unique_timestamp(url, timestamp)
-
- # Detect version
- fs_version = cls._detect_fs_version_from_index(data)
-
- return cls(
- url=url,
- timestamp=timestamp,
- title=data.get('title', ''),
- fs_version=fs_version,
- created_by_id=get_or_create_system_user_pk(),
- )
-
- @staticmethod
- def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
- """
- Select best timestamp from index.json vs folder name.
-
- Validates range (1995-2035).
- Prefers index.json if valid.
- """
- def is_valid_timestamp(ts):
- try:
- ts_int = int(float(ts))
- # 1995-01-01 to 2035-12-31
- return 788918400 <= ts_int <= 2082758400
- except Exception:
- return False
-
- index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
- folder_valid = is_valid_timestamp(folder_name)
-
- if index_valid:
- return str(int(float(index_timestamp)))
- elif folder_valid:
- return str(int(float(folder_name)))
- else:
- return None
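-
- # Illustrative examples (not part of the original code):
- #
- #     _select_best_timestamp('1704153600.5', '1704067200')  -> '1704153600'   # index.json wins
- #     _select_best_timestamp(None, 'not-a-timestamp')       -> None           # neither is usable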
-
- @classmethod
- def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
- """
- Ensure timestamp is globally unique.
- If collision with different URL, increment by 1 until unique.
-
- NOTE: Logic already exists in create_or_update_from_dict (line 266-267)
- This is just an extracted, reusable version.
- """
- while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists():
- timestamp = str(int(float(timestamp)) + 1)
- return timestamp
-
- @staticmethod
- def _detect_fs_version_from_index(data: dict) -> str:
- """
- Detect fs_version from index.json structure.
-
- - Has fs_version field: use it
- - Has history dict: 0.7.0
- - Has archive_results list: 0.8.0
- - Default: 0.7.0
- """
- if 'fs_version' in data:
- return data['fs_version']
- if 'history' in data and 'archive_results' not in data:
- return '0.7.0'
- if 'archive_results' in data:
- return '0.8.0'
- return '0.7.0'
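-
- # Illustrative examples (not part of the original code):
- #
- #     {'fs_version': '0.9.0', ...}              -> '0.9.0'
- #     {'history': {...}}                        -> '0.7.0'   # 0.7.x index.json shape
- #     {'archive_results': [...], ...}           -> '0.8.0'   # 0.8.x index.json shape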
-
- # =========================================================================
- # Index.json Reconciliation
- # =========================================================================
-
- def reconcile_with_index_json(self):
- """
- Merge index.json with DB. DB is source of truth.
-
- - Title: longest non-URL
- - Tags: union
- - ArchiveResults: keep both (by plugin+start_ts)
-
- Writes back in 0.9.x format.
-
- Used by: archivebox update (to sync index.json with DB)
- """
- import json
-
- index_path = Path(self.output_dir) / 'index.json'
-
- index_data = {}
- if index_path.exists():
- try:
- with open(index_path) as f:
- index_data = json.load(f)
- except Exception:
- pass
-
- # Merge title
- self._merge_title_from_index(index_data)
-
- # Merge tags
- self._merge_tags_from_index(index_data)
-
- # Merge ArchiveResults
- self._merge_archive_results_from_index(index_data)
-
- # Write back
- self.write_index_json()
-
- def _merge_title_from_index(self, index_data: dict):
- """Merge title - prefer longest non-URL title."""
- index_title = index_data.get('title', '').strip()
- db_title = self.title or ''
-
- candidates = [t for t in [index_title, db_title] if t and t != self.url]
- if candidates:
- best_title = max(candidates, key=len)
- if self.title != best_title:
- self.title = best_title
-
- def _merge_tags_from_index(self, index_data: dict):
- """Merge tags - union of both sources."""
- from django.db import transaction
-
- index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set()
- index_tags = {t.strip() for t in index_tags if t.strip()}
-
- db_tags = set(self.tags.values_list('name', flat=True))
-
- new_tags = index_tags - db_tags
- if new_tags:
- with transaction.atomic():
- for tag_name in new_tags:
- tag, _ = Tag.objects.get_or_create(name=tag_name)
- self.tags.add(tag)
-
- def _merge_archive_results_from_index(self, index_data: dict):
- """Merge ArchiveResults - keep both (by plugin+start_ts)."""
- existing = {
- (ar.plugin, ar.start_ts): ar
- for ar in ArchiveResult.objects.filter(snapshot=self)
- }
-
- # Handle 0.8.x format (archive_results list)
- for result_data in index_data.get('archive_results', []):
- self._create_archive_result_if_missing(result_data, existing)
-
- # Handle 0.7.x format (history dict)
- if 'history' in index_data and isinstance(index_data['history'], dict):
- for plugin, result_list in index_data['history'].items():
- if isinstance(result_list, list):
- for result_data in result_list:
- # Support both old 'extractor' and new 'plugin' keys for backwards compat
- result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin
- self._create_archive_result_if_missing(result_data, existing)
-
- def _create_archive_result_if_missing(self, result_data: dict, existing: dict):
- """Create ArchiveResult if not already in DB."""
- from dateutil import parser
-
- # Support both old 'extractor' and new 'plugin' keys for backwards compat
- plugin = result_data.get('plugin') or result_data.get('extractor', '')
- if not plugin:
- return
-
- start_ts = None
- if result_data.get('start_ts'):
- try:
- start_ts = parser.parse(result_data['start_ts'])
- except Exception:
- pass
-
- if (plugin, start_ts) in existing:
- return
-
- try:
- end_ts = None
- if result_data.get('end_ts'):
- try:
- end_ts = parser.parse(result_data['end_ts'])
- except Exception:
- pass
-
- ArchiveResult.objects.create(
- snapshot=self,
- plugin=plugin,
- hook_name=result_data.get('hook_name', ''),
- status=result_data.get('status', 'failed'),
- output_str=result_data.get('output', ''),
- cmd=result_data.get('cmd', []),
- pwd=result_data.get('pwd', str(self.output_dir)),
- start_ts=start_ts,
- end_ts=end_ts,
- created_by=self.crawl.created_by,
- )
- except Exception:
- pass
-
- def write_index_json(self):
- """Write index.json in 0.9.x format."""
- import json
-
- index_path = Path(self.output_dir) / 'index.json'
-
- data = {
- 'url': self.url,
- 'timestamp': self.timestamp,
- 'title': self.title or '',
- 'tags': ','.join(sorted(self.tags.values_list('name', flat=True))),
- 'fs_version': self.fs_version,
- 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
- 'created_at': self.created_at.isoformat() if self.created_at else None,
- 'archive_results': [
- {
- 'plugin': ar.plugin,
- 'status': ar.status,
- 'start_ts': ar.start_ts.isoformat() if ar.start_ts else None,
- 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
- 'output': ar.output_str or '',
- 'cmd': ar.cmd if isinstance(ar.cmd, list) else [],
- 'pwd': ar.pwd,
- }
- for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts')
- ],
- }
-
- index_path.parent.mkdir(parents=True, exist_ok=True)
- with open(index_path, 'w') as f:
- json.dump(data, f, indent=2, sort_keys=True)
-
- # =========================================================================
- # Snapshot Utilities
- # =========================================================================
-
- @staticmethod
- def move_directory_to_invalid(snapshot_dir: Path):
- """
- Move invalid directory to data/invalid/YYYYMMDD/.
-
- Used by: archivebox update (when encountering invalid directories)
- """
- from datetime import datetime
- import shutil
-
- invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d')
- invalid_dir.mkdir(parents=True, exist_ok=True)
-
- dest = invalid_dir / snapshot_dir.name
- counter = 1
- while dest.exists():
- dest = invalid_dir / f"{snapshot_dir.name}_{counter}"
- counter += 1
-
- try:
- shutil.move(str(snapshot_dir), str(dest))
- except Exception:
- pass
-
- @classmethod
- def find_and_merge_duplicates(cls) -> int:
- """
- Find and merge snapshots with same url:timestamp.
- Returns count of duplicate sets merged.
-
- Used by: archivebox update (Phase 3: deduplication)
- """
- from django.db.models import Count
-
- duplicates = (
- cls.objects
- .values('url', 'timestamp')
- .annotate(count=Count('id'))
- .filter(count__gt=1)
- )
-
- merged = 0
- for dup in duplicates.iterator():
- snapshots = list(
- cls.objects
- .filter(url=dup['url'], timestamp=dup['timestamp'])
- .order_by('created_at') # Keep oldest
- )
-
- if len(snapshots) > 1:
- try:
- cls._merge_snapshots(snapshots)
- merged += 1
- except Exception:
- pass
-
- return merged
-
- @classmethod
- def _merge_snapshots(cls, snapshots: list['Snapshot']):
- """
- Merge exact duplicates.
- Keep oldest, union files + ArchiveResults.
- """
- import shutil
-
- keeper = snapshots[0]
- duplicates = snapshots[1:]
-
- keeper_dir = Path(keeper.output_dir)
-
- for dup in duplicates:
- dup_dir = Path(dup.output_dir)
-
- # Merge files
- if dup_dir.exists() and dup_dir != keeper_dir:
- for dup_file in dup_dir.rglob('*'):
- if not dup_file.is_file():
- continue
-
- rel = dup_file.relative_to(dup_dir)
- keeper_file = keeper_dir / rel
-
- if not keeper_file.exists():
- keeper_file.parent.mkdir(parents=True, exist_ok=True)
- shutil.copy2(dup_file, keeper_file)
-
- try:
- shutil.rmtree(dup_dir)
- except Exception:
- pass
-
- # Merge tags
- for tag in dup.tags.all():
- keeper.tags.add(tag)
-
- # Move ArchiveResults
- ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper)
-
- # Delete
- dup.delete()
-
- # =========================================================================
- # Output Directory Properties
- # =========================================================================
-
- @property
- def output_dir_parent(self) -> str:
- return 'archive'
-
- @property
- def output_dir_name(self) -> str:
- return str(self.timestamp)
-
- def archive(self, overwrite=False, methods=None):
- return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
-
- @admin.display(description='Tags')
- def tags_str(self, nocache=True) -> str | None:
- calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
- if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
- return calc_tags_str()
- cache_key = f'{self.pk}-tags'
- return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
-
- def icons(self) -> str:
- """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
- from django.utils.html import format_html, mark_safe
-
- cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
-
- def calc_icons():
- if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
- archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
- else:
- # Filter for results that have either output_files or output_str
- from django.db.models import Q
- archive_results = {r.plugin: r for r in self.archiveresult_set.filter(
- Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
- )}
-
- path = self.archive_path
- canon = self.canonical_outputs()
- output = ""
- output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
-
- # Get all plugins from hooks system (sorted by numeric prefix)
- all_plugins = [get_plugin_name(e) for e in get_plugins()]
-
- for plugin in all_plugins:
- result = archive_results.get(plugin)
- existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
- icon = get_plugin_icon(plugin)
- output += format_html(
- output_template,
- path,
- canon.get(plugin, plugin + '/'),
- str(bool(existing)),
- plugin,
- icon
- )
-
- return format_html('{}', mark_safe(output))
-
- cache_result = cache.get(cache_key)
- if cache_result:
- return cache_result
-
- fresh_result = calc_icons()
- cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
- return fresh_result
-
- @property
- def api_url(self) -> str:
- return reverse_lazy('api-1:get_snapshot', args=[self.id])
-
- def get_absolute_url(self):
- return f'/{self.archive_path}'
-
- @cached_property
- def domain(self) -> str:
- return url_domain(self.url)
-
- @cached_property
- def output_dir(self):
- """The filesystem path to the snapshot's output directory."""
- import os
-
- current_path = self.get_storage_path_for_version(self.fs_version)
-
- if current_path.exists():
- return str(current_path)
-
- # Check for backwards-compat symlink
- old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
- if old_path.is_symlink():
- return str(Path(os.readlink(old_path)).resolve())
- elif old_path.exists():
- return str(old_path)
-
- return str(current_path)
-
- @cached_property
- def archive_path(self):
- return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
-
- @cached_property
- def archive_size(self):
- try:
- return get_dir_size(self.output_dir)[0]
- except Exception:
- return 0
-
- def save_tags(self, tags: Iterable[str] = ()) -> None:
- tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()]
- self.tags.clear()
- self.tags.add(*tags_id)
-
- def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
- return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
-
- def run(self) -> list['ArchiveResult']:
- """
- Execute snapshot by creating pending ArchiveResults for all enabled hooks.
-
- Called by: SnapshotMachine.enter_started()
-
- Hook Lifecycle:
- 1. discover_hooks('Snapshot') → finds all plugin hooks
- 2. For each hook:
- - Create ArchiveResult with status=QUEUED
- - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
- 3. ArchiveResults execute independently via ArchiveResultMachine
- 4. Hook execution happens in ArchiveResult.run(), NOT here
-
- Returns:
- list[ArchiveResult]: Newly created pending results
- """
- return self.create_pending_archiveresults()
-
- def cleanup(self):
- """
- Clean up background ArchiveResult hooks.
-
- Called by the state machine when entering the 'sealed' state.
- Kills any background hooks and finalizes their ArchiveResults.
- """
- from archivebox.hooks import kill_process
-
- # Kill any background ArchiveResult hooks
- if not self.OUTPUT_DIR.exists():
- return
-
- # Find all .pid files in this snapshot's output directory
- for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
- kill_process(pid_file, validate=True)
-
- # Update all STARTED ArchiveResults from filesystem
- results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
- for ar in results:
- ar.update_from_output()
-
- def has_running_background_hooks(self) -> bool:
- """
- Check if any ArchiveResult background hooks are still running.
-
- Used by state machine to determine if snapshot is finished.
- """
- from archivebox.hooks import process_is_alive
-
- if not self.OUTPUT_DIR.exists():
- return False
-
- for plugin_dir in self.OUTPUT_DIR.iterdir():
- if not plugin_dir.is_dir():
- continue
- pid_file = plugin_dir / 'hook.pid'
- if process_is_alive(pid_file):
- return True
-
- return False
-
- @staticmethod
- def from_jsonl(record: Dict[str, Any], overrides: Optional[Dict[str, Any]] = None, queue_for_extraction: bool = True):
- """
- Create/update Snapshot from JSONL record or dict.
-
- Unified method that handles:
- - ID-based patching: {"id": "...", "title": "new title"}
- - URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- - Auto-creates Crawl if not provided
- - Optionally queues for extraction
-
- Args:
- record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
- overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
- queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
-
- Returns:
- Snapshot instance or None
- """
- import re
- from django.utils import timezone
- from archivebox.misc.util import parse_date
- from archivebox.base_models.models import get_or_create_system_user_pk
- from archivebox.config.common import GENERAL_CONFIG
-
- overrides = overrides or {}
-
- # If 'id' is provided, lookup and patch that specific snapshot
- snapshot_id = record.get('id')
- if snapshot_id:
- try:
- snapshot = Snapshot.objects.get(id=snapshot_id)
-
- # Generically update all fields present in record
- update_fields = []
- for field_name, value in record.items():
- # Skip internal fields
- if field_name in ('id', 'type'):
- continue
-
- # Skip if field doesn't exist on model
- if not hasattr(snapshot, field_name):
- continue
-
- # Special parsing for date fields
- if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
- if value and isinstance(value, str):
- value = parse_date(value)
-
- # Update field if value is provided and different
- if value is not None and getattr(snapshot, field_name) != value:
- setattr(snapshot, field_name, value)
- update_fields.append(field_name)
-
- if update_fields:
- snapshot.save(update_fields=update_fields + ['modified_at'])
-
- return snapshot
- except Snapshot.DoesNotExist:
- # ID not found, fall through to create-by-URL logic
- pass
-
- url = record.get('url')
- if not url:
- return None
-
- # Determine or create crawl (every snapshot must have a crawl)
- crawl = overrides.get('crawl')
- parent_snapshot = overrides.get('snapshot') # Parent snapshot
- created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk()
-
- # If no crawl provided, inherit from parent or auto-create one
- if not crawl:
- if parent_snapshot:
- # Inherit crawl from parent snapshot
- crawl = parent_snapshot.crawl
- else:
- # Auto-create a single-URL crawl
- from archivebox.crawls.models import Crawl
- from archivebox.config import CONSTANTS
-
- timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
- sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
- sources_file.parent.mkdir(parents=True, exist_ok=True)
- sources_file.write_text(url)
-
- crawl = Crawl.objects.create(
- urls=url,
- max_depth=0,
- label=f'auto-created for {url[:50]}',
- created_by_id=created_by_id,
- )
-
- # Parse tags
- tags_str = record.get('tags', '')
- tag_list = []
- if tags_str:
- tag_list = list(dict.fromkeys(
- tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
- if tag.strip()
- ))
-
- # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
- snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
-
- title = record.get('title')
- timestamp = record.get('timestamp')
-
- if snapshot:
- # Update existing snapshot
- if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
- snapshot.title = title
- snapshot.save(update_fields=['title', 'modified_at'])
- else:
- # Create new snapshot
- if timestamp:
- while Snapshot.objects.filter(timestamp=timestamp).exists():
- timestamp = str(float(timestamp) + 1.0)
-
- snapshot = Snapshot.objects.create(
- url=url,
- timestamp=timestamp,
- title=title,
- crawl=crawl,
- )
-
- # Update tags
- if tag_list:
- existing_tags = set(snapshot.tags.values_list('name', flat=True))
- new_tags = set(tag_list) | existing_tags
- snapshot.save_tags(new_tags)
-
- # Queue for extraction and update additional fields
- update_fields = []
-
- if queue_for_extraction:
- snapshot.status = Snapshot.StatusChoices.QUEUED
- snapshot.retry_at = timezone.now()
- update_fields.extend(['status', 'retry_at'])
-
- # Update additional fields if provided
- for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
- value = record.get(field_name)
- if value is not None and getattr(snapshot, field_name) != value:
- setattr(snapshot, field_name, value)
- update_fields.append(field_name)
-
- if update_fields:
- snapshot.save(update_fields=update_fields + ['modified_at'])
-
- return snapshot
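-
- # Illustrative usage sketch (hypothetical values, not part of the original code):
- #
- #     # create or update a snapshot by URL, auto-creating a single-URL Crawl if needed:
- #     Snapshot.from_jsonl({'url': 'https://example.com', 'title': 'Example', 'tags': 'docs,demo'})
- #
- #     # patch specific fields on an existing snapshot by id:
- #     Snapshot.from_jsonl({'id': str(existing_snapshot.id), 'title': 'Better title'})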
-
- def create_pending_archiveresults(self) -> list['ArchiveResult']:
- """
- Create ArchiveResult records for all enabled hooks.
-
- Uses the hooks system to discover available hooks from:
- - archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
- - data/plugins/*/on_Snapshot__*.{py,sh,js}
-
- Creates one ArchiveResult per hook (not per plugin), with hook_name set.
- This enables step-based execution where all hooks in a step can run in parallel.
- """
- from archivebox.hooks import discover_hooks
-
- hooks = discover_hooks('Snapshot')
- archiveresults = []
-
- for hook_path in hooks:
- hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py'
- plugin = hook_path.parent.name # e.g., 'wget'
-
- # Check if AR already exists for this specific hook
- if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
- continue
-
- archiveresult, created = ArchiveResult.objects.get_or_create(
- snapshot=self,
- hook_name=hook_name,
- defaults={
- 'plugin': plugin,
- 'status': ArchiveResult.INITIAL_STATE,
- 'retry_at': timezone.now(),
- 'created_by_id': self.crawl.created_by_id,
- },
- )
- if archiveresult.status == ArchiveResult.INITIAL_STATE:
- archiveresults.append(archiveresult)
-
- return archiveresults
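-
- # Illustrative sketch (hypothetical plugin layout, not part of the original code):
- # given two discovered hooks such as
- #
- #     plugins/wget/on_Snapshot__50_wget.py
- #     plugins/screenshot/on_Snapshot__71_screenshot.py
- #
- # create_pending_archiveresults() creates one queued ArchiveResult per hook file,
- # with plugin set to the parent directory name ('wget', 'screenshot') and
- # hook_name set to the hook filename.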
-
- def advance_step_if_ready(self) -> bool:
- """
- Advance current_step if all foreground hooks in current step are finished.
-
- Called by the state machine to check if step can advance.
- Background hooks (.bg) don't block step advancement.
-
- Step advancement rules:
- - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
- - Background ARs (hook_name contains '.bg.') are ignored for advancement
- - When ready, increments current_step by 1 (up to 9)
-
- Returns:
- True if step was advanced, False if not ready or already at step 9.
- """
- from archivebox.hooks import extract_step, is_background_hook
-
- if self.current_step >= 9:
- return False # Already at final step
-
- # Get all ARs for current step that are foreground
- current_step_ars = self.archiveresult_set.filter(
- hook_name__isnull=False
- ).exclude(hook_name='')
-
- # Check each AR in current step
- for ar in current_step_ars:
- ar_step = extract_step(ar.hook_name)
- if ar_step != self.current_step:
- continue # Not in current step
-
- if is_background_hook(ar.hook_name):
- continue # Background hooks don't block
-
- # Foreground hook in current step - check if finished
- if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
- # Still pending/queued - can't advance
- return False
-
- if ar.status == ArchiveResult.StatusChoices.STARTED:
- # Still running - can't advance
- return False
-
- # All foreground hooks in current step are finished - advance!
- self.current_step += 1
- self.save(update_fields=['current_step', 'modified_at'])
- return True
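-
- # Illustrative sketch (assumption, not part of the original code): extract_step() is
- # assumed to map a hook's numeric prefix to its step digit, e.g.
- # 'on_Snapshot__50_wget.py' -> step 5, so a snapshot at current_step=5 only waits on
- # foreground hooks in the 5x range before advancing to step 6.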
-
- def is_finished_processing(self) -> bool:
- """
- Check if this snapshot has finished processing.
-
- Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
-
- Returns:
- True if all archiveresults are finished (or no work to do), False otherwise.
- """
- # if no archiveresults exist yet, it's not finished
- if not self.archiveresult_set.exists():
- return False
-
- # Try to advance step if ready (handles step-based hook execution)
- # This will increment current_step when all foreground hooks in current step are done
- while self.advance_step_if_ready():
- pass # Keep advancing until we can't anymore
-
- # if archiveresults exist but are still pending, it's not finished
- if self.pending_archiveresults().exists():
- return False
-
- # Don't wait for background hooks - they'll be cleaned up on entering sealed state
- # Background hooks in STARTED state are excluded by pending_archiveresults()
- # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
- # we can transition to sealed and cleanup() will kill the background hooks
-
- # otherwise archiveresults exist and are all finished, so it's finished
- return True
-
- def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
- """
- Reset failed/skipped ArchiveResults to queued for retry.
-
- This enables seamless retry of the entire extraction pipeline:
- - Resets FAILED and SKIPPED results to QUEUED
- - Sets retry_at so workers pick them up
- - Plugins run in order (numeric prefix)
- - Each plugin checks its dependencies at runtime
-
- Dependency handling (e.g., chrome_session → screenshot):
- - Plugins check if required outputs exist before running
- - If dependency output missing → plugin returns 'skipped'
- - On retry, if dependency now succeeds → dependent can run
-
- Returns count of ArchiveResults reset.
- """
- retry_at = retry_at or timezone.now()
-
- count = self.archiveresult_set.filter(
- status__in=[
- ArchiveResult.StatusChoices.FAILED,
- ArchiveResult.StatusChoices.SKIPPED,
- ]
- ).update(
- status=ArchiveResult.StatusChoices.QUEUED,
- retry_at=retry_at,
- output_str='',  # the legacy 'output' field was replaced by output_str/output_files
- start_ts=None,
- end_ts=None,
- )
-
- # Also reset the snapshot and current_step so it gets re-checked from the beginning
- if count > 0:
- self.status = self.StatusChoices.STARTED
- self.retry_at = retry_at
- self.current_step = 0 # Reset to step 0 for retry
- self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
-
- return count
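-
- # Illustrative usage sketch (not part of the original code):
- #
- #     num_reset = snapshot.retry_failed_archiveresults()
- #     # -> failed/skipped results are re-queued and the snapshot restarts at step 0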
-
- # =========================================================================
- # URL Helper Properties (migrated from Link schema)
- # =========================================================================
-
- @cached_property
- def url_hash(self) -> str:
- from hashlib import sha256
- return sha256(self.url.encode()).hexdigest()[:8]
-
- @cached_property
- def scheme(self) -> str:
- return self.url.split('://')[0]
-
- @cached_property
- def path(self) -> str:
- parts = self.url.split('://', 1)
- return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'
-
- @cached_property
- def basename(self) -> str:
- return self.path.split('/')[-1]
-
- @cached_property
- def extension(self) -> str:
- basename = self.basename
- return basename.split('.')[-1] if '.' in basename else ''
-
- @cached_property
- def base_url(self) -> str:
- return f'{self.scheme}://{self.domain}'
-
- @cached_property
- def is_static(self) -> bool:
- static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
- return any(self.url.lower().endswith(ext) for ext in static_extensions)
-
- @cached_property
- def is_archived(self) -> bool:
- output_paths = (
- self.domain,
- 'output.html',
- 'output.pdf',
- 'screenshot.png',
- 'singlefile.html',
- 'readability/content.html',
- 'mercury/content.html',
- 'htmltotext.txt',
- 'media',
- 'git',
- )
- return any((Path(self.output_dir) / path).exists() for path in output_paths)
-
- # =========================================================================
- # Date/Time Properties (migrated from Link schema)
- # =========================================================================
-
- @cached_property
- def bookmarked_date(self) -> Optional[str]:
- max_ts = (timezone.now() + timedelta(days=30)).timestamp()
- if self.timestamp and self.timestamp.replace('.', '').isdigit():
- if 0 < float(self.timestamp) < max_ts:
- return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
- return str(self.timestamp)
- return None
-
- @cached_property
- def downloaded_datestr(self) -> Optional[str]:
- return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
-
- @cached_property
- def archive_dates(self) -> List[datetime]:
- return [
- result.start_ts
- for result in self.archiveresult_set.all()
- if result.start_ts
- ]
-
- @cached_property
- def oldest_archive_date(self) -> Optional[datetime]:
- dates = self.archive_dates
- return min(dates) if dates else None
-
- @cached_property
- def newest_archive_date(self) -> Optional[datetime]:
- dates = self.archive_dates
- return max(dates) if dates else None
-
- @cached_property
- def num_outputs(self) -> int:
- return self.archiveresult_set.filter(status='succeeded').count()
-
- @cached_property
- def num_failures(self) -> int:
- return self.archiveresult_set.filter(status='failed').count()
-
- # =========================================================================
- # Output Path Methods (migrated from Link schema)
- # =========================================================================
-
- def canonical_outputs(self) -> Dict[str, Optional[str]]:
- """
- Intelligently discover the best output file for each plugin.
- Uses actual ArchiveResult data and filesystem scanning with smart heuristics.
- """
- FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
-
- # Mimetypes that can be embedded/previewed in an iframe
- IFRAME_EMBEDDABLE_EXTENSIONS = {
- 'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl',
- 'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico',
- 'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav',
- }
-
- MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files
- MAX_SCAN_FILES = 50 # Don't scan massive directories
-
- def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]:
- """Find the best representative file in a plugin's output directory"""
- if not dir_path.exists() or not dir_path.is_dir():
- return None
-
- candidates = []
- file_count = 0
-
- # Special handling for media plugin - look for thumbnails
- is_media_dir = plugin_name == 'media'
-
- # Scan for suitable files
- for file_path in dir_path.rglob('*'):
- file_count += 1
- if file_count > MAX_SCAN_FILES:
- break
-
- if file_path.is_dir() or file_path.name.startswith('.'):
- continue
-
- ext = file_path.suffix.lstrip('.').lower()
- if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
- continue
-
- try:
- size = file_path.stat().st_size
- except OSError:
- continue
-
- # For media dir, allow smaller image files (thumbnails are often < 15KB)
- min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE
- if size < min_size:
- continue
-
- # Prefer main files: index.html, output.*, content.*, etc.
- priority = 0
- name_lower = file_path.name.lower()
-
- if is_media_dir:
- # Special prioritization for media directories
- if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')):
- priority = 200 # Highest priority for thumbnails
- elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'):
- priority = 150 # High priority for any image
- elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'):
- priority = 100 # Lower priority for actual media files
- else:
- priority = 50
- elif 'index' in name_lower:
- priority = 100
- elif name_lower.startswith(('output', 'content', plugin_name)):
- priority = 50
- elif ext in ('html', 'htm', 'pdf'):
- priority = 30
- elif ext in ('png', 'jpg', 'jpeg', 'webp'):
- priority = 20
- else:
- priority = 10
-
- candidates.append((priority, size, file_path))
-
- if not candidates:
- return None
-
- # Sort by priority (desc), then size (desc)
- candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
- best_file = candidates[0][2]
- return str(best_file.relative_to(Path(self.output_dir)))
-
- canonical = {
- 'index_path': 'index.html',
- 'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
- 'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
- }
-
- # Scan each ArchiveResult's output directory for the best file
- snap_dir = Path(self.output_dir)
- for result in self.archiveresult_set.filter(status='succeeded'):
- if not result.output_files and not result.output_str:
- continue
-
- # Try to find the best output file for this plugin
- plugin_dir = snap_dir / result.plugin
- best_output = None
-
- # Check output_files first (new field)
- if result.output_files:
- first_file = next(iter(result.output_files.keys()), None)
- if first_file and (plugin_dir / first_file).exists():
- best_output = f'{result.plugin}/{first_file}'
-
- # Fallback to output_str if it looks like a path
- if not best_output and result.output_str and (snap_dir / result.output_str).exists():
- best_output = result.output_str
-
- if not best_output and plugin_dir.exists():
- # Intelligently find the best file in the plugin's directory
- best_output = find_best_output_in_dir(plugin_dir, result.plugin)
-
- if best_output:
- canonical[f'{result.plugin}_path'] = best_output
-
- # Also scan top-level for legacy outputs (backwards compatibility)
- for file_path in snap_dir.glob('*'):
- if file_path.is_dir() or file_path.name in ('index.html', 'index.json'):
- continue
-
- ext = file_path.suffix.lstrip('.').lower()
- if ext not in IFRAME_EMBEDDABLE_EXTENSIONS:
- continue
-
- try:
- size = file_path.stat().st_size
- if size >= MIN_DISPLAY_SIZE:
- # Add as generic output with stem as key
- key = f'{file_path.stem}_path'
- if key not in canonical:
- canonical[key] = file_path.name
- except OSError:
- continue
-
- if self.is_static:
- static_path = f'warc/{self.timestamp}'
- canonical.update({
- 'title': self.basename,
- 'wget_path': static_path,
- })
-
- return canonical
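-
- # Illustrative sketch of a possible return value (hypothetical paths, not from the
- # original code):
- #
- #     {
- #         'index_path': 'index.html',
- #         'google_favicon_path': 'https://www.google.com/s2/favicons?domain=example.com',
- #         'archive_org_path': 'https://web.archive.org/web/https://example.com',
- #         'wget_path': 'wget/index.html',
- #         'screenshot_path': 'screenshot/screenshot.png',
- #     }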
-
- def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
- """Get the latest output that each plugin produced"""
- from archivebox.hooks import get_plugins, get_plugin_name
- from django.db.models import Q
-
- latest: Dict[str, Any] = {}
- for plugin in [get_plugin_name(e) for e in get_plugins()]:
- results = self.archiveresult_set.filter(plugin=plugin)
- if status is not None:
- results = results.filter(status=status)
- # Filter for results with output_files or output_str
- results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
- result = results.first()
- # Return embed_path() for backwards compatibility
- latest[plugin] = result.embed_path() if result else None
- return latest
-
- # =========================================================================
- # Serialization Methods
- # =========================================================================
-
- def to_dict(self, extended: bool = False) -> Dict[str, Any]:
- """Convert Snapshot to a dictionary (replacement for Link._asdict())"""
- from archivebox.misc.util import ts_to_date_str
-
- result = {
- 'TYPE': 'core.models.Snapshot',
- 'id': str(self.id),
- 'url': self.url,
- 'timestamp': self.timestamp,
- 'title': self.title,
- 'tags': self.tags_str(),
- 'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
- 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
- 'created_at': self.created_at.isoformat() if self.created_at else None,
- # Computed properties
- 'domain': self.domain,
- 'scheme': self.scheme,
- 'base_url': self.base_url,
- 'path': self.path,
- 'basename': self.basename,
- 'extension': self.extension,
- 'is_static': self.is_static,
- 'is_archived': self.is_archived,
- 'archive_path': self.archive_path,
- 'output_dir': self.output_dir,
- 'link_dir': self.output_dir, # backwards compatibility alias
- 'archive_size': self.archive_size,
- 'bookmarked_date': self.bookmarked_date,
- 'downloaded_datestr': self.downloaded_datestr,
- 'num_outputs': self.num_outputs,
- 'num_failures': self.num_failures,
- }
- if extended:
- result['canonical'] = self.canonical_outputs()
- return result
-
- def to_json(self, indent: int = 4) -> str:
- """Convert to JSON string"""
- return to_json(self.to_dict(extended=True), indent=indent)
-
- def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
- """Convert to CSV string"""
- data = self.to_dict()
- cols = cols or ['timestamp', 'is_archived', 'url']
- return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
-
- def write_json_details(self, out_dir: Optional[str] = None) -> None:
- """Write JSON index file for this snapshot to its output directory"""
- out_dir = out_dir or self.output_dir
- path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
- atomic_write(str(path), self.to_dict(extended=True))
-
- def write_html_details(self, out_dir: Optional[str] = None) -> None:
- """Write HTML detail page for this snapshot to its output directory"""
- from django.template.loader import render_to_string
- from archivebox.config.common import SERVER_CONFIG
- from archivebox.config.configset import get_config
- from archivebox.misc.logging_util import printable_filesize
- from archivebox.misc.util import ts_to_date_str
-
- out_dir = out_dir or self.output_dir
- config = get_config()
- SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
- TITLE_LOADING_MSG = 'Not yet archived...'
-
- canonical = self.canonical_outputs()
- context = {
- **self.to_dict(extended=True),
- **{f'{k}_path': v for k, v in canonical.items()},
- 'canonical': {f'{k}_path': v for k, v in canonical.items()},
- 'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
- 'url_str': htmlencode(urldecode(self.base_url)),
- 'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
- 'extension': self.extension or 'html',
- 'tags': self.tags_str() or 'untagged',
- 'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
- 'status': 'archived' if self.is_archived else 'not yet archived',
- 'status_color': 'success' if self.is_archived else 'danger',
- 'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
- 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
- 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
- }
- rendered_html = render_to_string('snapshot.html', context)
- atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
-
- # =========================================================================
- # Helper Methods
- # =========================================================================
-
- @staticmethod
- def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
- return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
-
-
-# =============================================================================
-# Snapshot State Machine
-# =============================================================================
-
-class SnapshotMachine(BaseStateMachine, strict_states=True):
- """
- State machine for managing Snapshot lifecycle.
-
- Hook Lifecycle:
- ┌─────────────────────────────────────────────────────────────┐
- │ QUEUED State │
- │ • Waiting for snapshot to be ready │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() when can_start()
- ┌─────────────────────────────────────────────────────────────┐
- │ STARTED State → enter_started() │
- │ 1. snapshot.run() │
- │ • discover_hooks('Snapshot') → finds all plugin hooks │
- │ • create_pending_archiveresults() → creates ONE │
- │ ArchiveResult per hook (NO execution yet) │
- │ 2. ArchiveResults process independently with their own │
- │ state machines (see ArchiveResultMachine) │
- │ 3. Advance through steps 0-9 as foreground hooks complete │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() when is_finished()
- ┌─────────────────────────────────────────────────────────────┐
- │ SEALED State → enter_sealed() │
- │ • cleanup() → kills any background hooks still running │
- │ • Set retry_at=None (no more processing) │
- └─────────────────────────────────────────────────────────────┘
-
- https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
- """
-
- model_attr_name = 'snapshot'
-
- # States
- queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
- started = State(value=Snapshot.StatusChoices.STARTED)
- sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
-
- # Tick Event
- tick = (
- queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start') |
- started.to.itself(unless='is_finished') |
- started.to(sealed, cond='is_finished')
- )
-
- def can_start(self) -> bool:
- can_start = bool(self.snapshot.url)
- # Suppressed: queue waiting logs
- return can_start
-
- def is_finished(self) -> bool:
- """Check if snapshot processing is complete - delegates to model method."""
- return self.snapshot.is_finished_processing()
-
- @queued.enter
- def enter_queued(self):
- # Suppressed: state transition logs
- self.snapshot.update_and_requeue(
- retry_at=timezone.now(),
- status=Snapshot.StatusChoices.QUEUED,
- )
-
- @started.enter
- def enter_started(self):
- # Suppressed: state transition logs
- # lock the snapshot while we create the pending archiveresults
- self.snapshot.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
- )
-
- # Run the snapshot - creates pending archiveresults for all enabled plugins
- self.snapshot.run()
-
- # unlock the snapshot after we're done + set status = started
- self.snapshot.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
- status=Snapshot.StatusChoices.STARTED,
- )
-
- @sealed.enter
- def enter_sealed(self):
- # Clean up background hooks
- self.snapshot.cleanup()
-
- # Suppressed: state transition logs
- self.snapshot.update_and_requeue(
- retry_at=None,
- status=Snapshot.StatusChoices.SEALED,
- )
-
-
-class ArchiveResultManager(models.Manager):
- def indexable(self, sorted: bool = True):
- INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
- qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
- if sorted:
- precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
- qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
- return qs
-
-
-class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
- class StatusChoices(models.TextChoices):
- QUEUED = 'queued', 'Queued'
- STARTED = 'started', 'Started'
- BACKOFF = 'backoff', 'Waiting to retry'
- SUCCEEDED = 'succeeded', 'Succeeded'
- FAILED = 'failed', 'Failed'
- SKIPPED = 'skipped', 'Skipped'
-
- @classmethod
- def get_plugin_choices(cls):
- """Get plugin choices from discovered hooks (for forms/admin)."""
- plugins = [get_plugin_name(e) for e in get_plugins()]
- return tuple((e, e) for e in plugins)
-
- # Keep AutoField for backward compatibility with 0.7.x databases
- # UUID field is added separately by migration for new records
- id = models.AutoField(primary_key=True, editable=False)
- # Note: unique constraint is added by migration 0027 - don't set unique=True here
- # or SQLite table recreation in earlier migrations will fail
- uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
- created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
- created_at = models.DateTimeField(default=timezone.now, db_index=True)
- modified_at = models.DateTimeField(auto_now=True)
-
- snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
- # No choices= constraint - plugin names come from plugin system and can be any string
- plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True)
- hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
- pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
- cmd = models.JSONField(default=None, null=True, blank=True)
- cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-
- # New output fields (replacing old 'output' field)
- output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
- output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
- output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
- output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
- output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
-
- # Binary FK (optional - set when hook reports cmd)
- binary = models.ForeignKey(
- 'machine.Binary',
- on_delete=models.SET_NULL,
- null=True, blank=True,
- related_name='archiveresults',
- help_text='Primary binary used by this hook'
- )
-
- start_ts = models.DateTimeField(default=None, null=True, blank=True)
- end_ts = models.DateTimeField(default=None, null=True, blank=True)
-
- status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
- retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
- notes = models.TextField(blank=True, null=False, default='')
- output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
- iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
-
- state_machine_name = 'core.models.ArchiveResultMachine'
- retry_at_field_name = 'retry_at'
- state_field_name = 'status'
- active_state = StatusChoices.STARTED
-
- objects = ArchiveResultManager()
-
- class Meta(TypedModelMeta):
- verbose_name = 'Archive Result'
- verbose_name_plural = 'Archive Results Log'
-
- def __str__(self):
- return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
-
- def save(self, *args, **kwargs):
- is_new = self._state.adding
- # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
- # Call the Django Model.save() directly instead
- models.Model.save(self, *args, **kwargs)
-
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created ArchiveResult',
- indent_level=3,
- plugin=self.plugin,
- metadata={
- 'id': str(self.id),
- 'snapshot_id': str(self.snapshot_id),
- 'snapshot_url': str(self.snapshot.url)[:64],
- 'status': self.status,
- },
- )
-
- @cached_property
- def snapshot_dir(self):
- return Path(self.snapshot.output_dir)
-
- @cached_property
- def url(self):
- return self.snapshot.url
-
- @property
- def api_url(self) -> str:
- return reverse_lazy('api-1:get_archiveresult', args=[self.id])
-
- def get_absolute_url(self):
- return f'/{self.snapshot.archive_path}/{self.plugin}'
-
- @property
- def plugin_module(self) -> Any | None:
- # Hook scripts are now used instead of Python plugin modules
- # The plugin name maps to hooks in archivebox/plugins/{plugin}/
- return None
-
- def output_exists(self) -> bool:
- return os.path.exists(Path(self.snapshot_dir) / self.plugin)
-
- def embed_path(self) -> Optional[str]:
- """
- Get the relative path to the embeddable output file for this result.
-
- Returns the first file from output_files if set, otherwise tries to
- find a reasonable default based on the plugin type.
- """
- # Check output_files dict for primary output
- if self.output_files:
- # Return first file from output_files (dict preserves insertion order)
- first_file = next(iter(self.output_files.keys()), None)
- if first_file:
- return f'{self.plugin}/{first_file}'
-
- # Fallback: check output_str if it looks like a file path
- if self.output_str and ('/' in self.output_str or '.' in self.output_str):
- return self.output_str
-
- # Try to find output file based on plugin's canonical output path
- canonical = self.snapshot.canonical_outputs()
- plugin_key = f'{self.plugin}_path'
- if plugin_key in canonical:
- return canonical[plugin_key]
-
- # Fallback to plugin directory
- return f'{self.plugin}/'
-
- def create_output_dir(self):
- output_dir = Path(self.snapshot_dir) / self.plugin
- output_dir.mkdir(parents=True, exist_ok=True)
- return output_dir
-
- @property
- def output_dir_name(self) -> str:
- return self.plugin
-
- @property
- def output_dir_parent(self) -> str:
- return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
-
- def save_search_index(self):
- pass
-
- def cascade_health_update(self, success: bool):
- """Update health stats for self, parent Snapshot, and grandparent Crawl (if present)."""
- self.increment_health_stats(success)
- self.snapshot.increment_health_stats(success)
- if self.snapshot.crawl_id:
- self.snapshot.crawl.increment_health_stats(success)
-
- def run(self):
- """
- Execute this ArchiveResult's hook and update status.
-
- If self.hook_name is set, runs only that specific hook.
- If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
-
- Updates status/output fields, queues discovered URLs, and triggers indexing.
- """
- from django.utils import timezone
- from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
- from archivebox.config.configset import get_config
-
- # Get merged config with proper context
- config = get_config(
- crawl=self.snapshot.crawl if self.snapshot.crawl else None,
- snapshot=self.snapshot,
- )
-
- # Determine which hook(s) to run
- hooks = []
-
- if self.hook_name:
- # SPECIFIC HOOK MODE: Find the specific hook by name
- for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
- if not base_dir.exists():
- continue
- plugin_dir = base_dir / self.plugin
- if plugin_dir.exists():
- hook_path = plugin_dir / self.hook_name
- if hook_path.exists():
- hooks.append(hook_path)
- break
- else:
- # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
- for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
- if not base_dir.exists():
- continue
- plugin_dir = base_dir / self.plugin
- if plugin_dir.exists():
- matches = list(plugin_dir.glob('on_Snapshot__*.*'))
- if matches:
- hooks.extend(sorted(matches))
-
- if not hooks:
- self.status = self.StatusChoices.FAILED
- if self.hook_name:
- self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
- else:
- self.output_str = f'No hooks found for plugin: {self.plugin}'
- self.retry_at = None
- self.save()
- return
-
- # Output directory is plugin_dir for the hook output
- plugin_dir = Path(self.snapshot.output_dir) / self.plugin
-
- start_ts = timezone.now()
- is_bg_hook = False
-
- for hook in hooks:
- # Check if this is a background hook
- is_bg_hook = is_background_hook(hook.name)
-
- result = run_hook(
- hook,
- output_dir=plugin_dir,
- config=config,
- url=self.snapshot.url,
- snapshot_id=str(self.snapshot.id),
- crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
- depth=self.snapshot.depth,
- )
-
- # Background hooks return None
- if result is None:
- is_bg_hook = True
-
- # Update status based on hook execution
- if is_bg_hook:
- # BACKGROUND HOOK - still running, return immediately
- # Status stays STARTED, will be finalized by Snapshot.cleanup()
- self.status = self.StatusChoices.STARTED
- self.start_ts = start_ts
- self.pwd = str(plugin_dir)
- self.save()
- return
-
- # FOREGROUND HOOK - completed, update from filesystem
- self.start_ts = start_ts
- self.pwd = str(plugin_dir)
- self.update_from_output()
-
- # Clean up empty output directory if no files were created
- if plugin_dir.exists() and not self.output_files:
- try:
- if not any(plugin_dir.iterdir()):
- plugin_dir.rmdir()
- except (OSError, RuntimeError):
- pass
-
- def update_from_output(self):
- """
- Update this ArchiveResult from filesystem logs and output files.
-
- Used for:
- - Foreground hooks that completed (called from ArchiveResult.run())
- - Background hooks that completed (called from Snapshot.cleanup())
-
- Updates:
- - status, output_str, output_json from ArchiveResult JSONL record
- - output_files, output_size, output_mimetypes by walking filesystem
- - end_ts, retry_at, cmd, cmd_version, binary FK
- - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
- """
- import json
- import mimetypes
- from collections import defaultdict
- from pathlib import Path
- from django.utils import timezone
- from archivebox.hooks import process_hook_records
-
- plugin_dir = Path(self.pwd) if self.pwd else None
- if not plugin_dir or not plugin_dir.exists():
- self.status = self.StatusChoices.FAILED
- self.output_str = 'Output directory not found'
- self.end_ts = timezone.now()
- self.retry_at = None
- self.save()
- return
-
- # Read and parse JSONL output from stdout.log
- stdout_file = plugin_dir / 'stdout.log'
- stdout = stdout_file.read_text() if stdout_file.exists() else ''
-
- records = []
- for line in stdout.splitlines():
- if line.strip() and line.strip().startswith('{'):
- try:
- records.append(json.loads(line))
- except json.JSONDecodeError:
- continue
-
- # Find ArchiveResult record and update status/output from it
- ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
- if ar_records:
- hook_data = ar_records[0]
-
- # Update status
- status_map = {
- 'succeeded': self.StatusChoices.SUCCEEDED,
- 'failed': self.StatusChoices.FAILED,
- 'skipped': self.StatusChoices.SKIPPED,
- }
- self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
-
- # Update output fields
- self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
- self.output_json = hook_data.get('output_json')
-
- # Update cmd fields
- if hook_data.get('cmd'):
- self.cmd = hook_data['cmd']
- self._set_binary_from_cmd(hook_data['cmd'])
- if hook_data.get('cmd_version'):
- self.cmd_version = hook_data['cmd_version'][:128]
- else:
- # No ArchiveResult record = failed
- self.status = self.StatusChoices.FAILED
- self.output_str = 'Hook did not output ArchiveResult record'
-
- # Walk filesystem and populate output_files, output_size, output_mimetypes
- exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
- mime_sizes = defaultdict(int)
- total_size = 0
- output_files = {}
-
- for file_path in plugin_dir.rglob('*'):
- if not file_path.is_file():
- continue
- if file_path.name in exclude_names:
- continue
-
- try:
- stat = file_path.stat()
- mime_type, _ = mimetypes.guess_type(str(file_path))
- mime_type = mime_type or 'application/octet-stream'
-
- relative_path = str(file_path.relative_to(plugin_dir))
- output_files[relative_path] = {}
- mime_sizes[mime_type] += stat.st_size
- total_size += stat.st_size
- except (OSError, IOError):
- continue
-
- self.output_files = output_files
- self.output_size = total_size
- sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
- self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
-
- # Update timestamps
- self.end_ts = timezone.now()
- self.retry_at = None
-
- self.save()
-
- # Process side-effect records (filter Snapshots for depth/URL)
- filtered_records = []
- for record in records:
- record_type = record.get('type')
-
- # Skip ArchiveResult records (already processed above)
- if record_type == 'ArchiveResult':
- continue
-
- # Filter Snapshot records for depth/URL constraints
- if record_type == 'Snapshot':
- if not self.snapshot.crawl:
- continue
-
- url = record.get('url')
- if not url:
- continue
-
- depth = record.get('depth', self.snapshot.depth + 1)
- if depth > self.snapshot.crawl.max_depth:
- continue
-
- if not self._url_passes_filters(url):
- continue
-
- filtered_records.append(record)
-
- # Process filtered records with unified dispatcher
- overrides = {
- 'snapshot': self.snapshot,
- 'crawl': self.snapshot.crawl,
- 'created_by_id': self.snapshot.crawl.created_by_id,
- }
- process_hook_records(filtered_records, overrides=overrides)
-
- # Cleanup PID files and empty logs
- pid_file = plugin_dir / 'hook.pid'
- pid_file.unlink(missing_ok=True)
- stderr_file = plugin_dir / 'stderr.log'
- if stdout_file.exists() and stdout_file.stat().st_size == 0:
- stdout_file.unlink()
- if stderr_file.exists() and stderr_file.stat().st_size == 0:
- stderr_file.unlink()
-
- def _set_binary_from_cmd(self, cmd: list) -> None:
- """
- Find Binary for command and set binary FK.
-
- Tries matching by absolute path first, then by binary name.
- Only matches binaries on the current machine.
- """
- if not cmd:
- return
-
- from archivebox.machine.models import Machine
-
- bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
- machine = Machine.current()
-
- # Try matching by absolute path first
- binary = Binary.objects.filter(
- abspath=bin_path_or_name,
- machine=machine
- ).first()
-
- if binary:
- self.binary = binary
- return
-
- # Fallback: match by binary name
- bin_name = Path(bin_path_or_name).name
- binary = Binary.objects.filter(
- name=bin_name,
- machine=machine
- ).first()
-
- if binary:
- self.binary = binary
-
- def _url_passes_filters(self, url: str) -> bool:
- """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
-
- Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
- """
- import re
- from archivebox.config.configset import get_config
-
- # Get merged config with proper hierarchy
- config = get_config(
- user=self.snapshot.crawl.created_by if self.snapshot else None,
- crawl=self.snapshot.crawl if self.snapshot else None,
- snapshot=self.snapshot,
- )
-
- # Get allowlist/denylist (can be string or list)
- allowlist_raw = config.get('URL_ALLOWLIST', '')
- denylist_raw = config.get('URL_DENYLIST', '')
-
- # Normalize to list of patterns
- def to_pattern_list(value):
- if isinstance(value, list):
- return value
- if isinstance(value, str):
- return [p.strip() for p in value.split(',') if p.strip()]
- return []
-
- allowlist = to_pattern_list(allowlist_raw)
- denylist = to_pattern_list(denylist_raw)
-
- # Denylist takes precedence
- if denylist:
- for pattern in denylist:
- try:
- if re.search(pattern, url):
- return False
- except re.error:
- continue # Skip invalid regex patterns
-
- # If allowlist exists, URL must match at least one pattern
- if allowlist:
- for pattern in allowlist:
- try:
- if re.search(pattern, url):
- return True
- except re.error:
- continue # Skip invalid regex patterns
- return False # No allowlist patterns matched
-
- return True # No filters or passed filters
-
- @property
- def output_dir(self) -> Path:
- """Get the output directory for this plugin's results."""
- return Path(self.snapshot.output_dir) / self.plugin
-
- def is_background_hook(self) -> bool:
- """Check if this ArchiveResult is for a background hook."""
- plugin_dir = Path(self.pwd) if self.pwd else None
- if not plugin_dir:
- return False
- pid_file = plugin_dir / 'hook.pid'
- return pid_file.exists()
-
-
-# =============================================================================
-# ArchiveResult State Machine
-# =============================================================================
-
-class ArchiveResultMachine(BaseStateMachine, strict_states=True):
- """
- State machine for managing ArchiveResult (single plugin execution) lifecycle.
-
- Hook Lifecycle:
- ┌─────────────────────────────────────────────────────────────┐
- │ QUEUED State │
- │ • Waiting for its turn to run │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() when can_start()
- ┌─────────────────────────────────────────────────────────────┐
- │ STARTED State → enter_started() │
- │ 1. archiveresult.run() │
- │ • Find specific hook by hook_name │
- │ • run_hook(script, output_dir, ...) → subprocess │
- │ │
- │ 2a. FOREGROUND hook (returns HookResult): │
- │ • update_from_output() immediately │
- │ - Read stdout.log │
- │ - Parse JSONL records │
- │ - Extract 'ArchiveResult' record → update status │
- │ - Walk output_dir → populate output_files │
- │ - Call process_hook_records() for side effects │
- │ │
- │ 2b. BACKGROUND hook (returns None): │
- │ • Status stays STARTED │
- │ • Continues running in background │
- │ • Killed by Snapshot.cleanup() when sealed │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() checks status
- ┌─────────────────────────────────────────────────────────────┐
- │ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
- │ • Set by hook's JSONL output during update_from_output() │
- │ • Health stats incremented (num_uses_succeeded/failed) │
- │ • Parent Snapshot health stats also updated │
- └─────────────────────────────────────────────────────────────┘
-
- https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
- """
-
- model_attr_name = 'archiveresult'
-
- # States
- queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
- started = State(value=ArchiveResult.StatusChoices.STARTED)
- backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
- succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
- failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
- skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
-
- # Tick Event - transitions based on conditions
- tick = (
- queued.to.itself(unless='can_start') |
- queued.to(started, cond='can_start') |
- started.to.itself(unless='is_finished') |
- started.to(succeeded, cond='is_succeeded') |
- started.to(failed, cond='is_failed') |
- started.to(skipped, cond='is_skipped') |
- started.to(backoff, cond='is_backoff') |
- backoff.to.itself(unless='can_start') |
- backoff.to(started, cond='can_start') |
- backoff.to(succeeded, cond='is_succeeded') |
- backoff.to(failed, cond='is_failed') |
- backoff.to(skipped, cond='is_skipped')
- )
-
- def can_start(self) -> bool:
- can_start = bool(self.archiveresult.snapshot.url)
- # Suppressed: queue waiting logs
- return can_start
-
- def is_succeeded(self) -> bool:
- """Check if extractor plugin succeeded (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-
- def is_failed(self) -> bool:
- """Check if extractor plugin failed (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-
- def is_skipped(self) -> bool:
- """Check if extractor plugin was skipped (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
-
- def is_backoff(self) -> bool:
- """Check if we should backoff and retry later."""
- # Backoff if status is still started (plugin didn't complete) and output_str is empty
- return (
- self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
- not self.archiveresult.output_str
- )
-
- def is_finished(self) -> bool:
- """Check if extraction has completed (success, failure, or skipped)."""
- return self.archiveresult.status in (
- ArchiveResult.StatusChoices.SUCCEEDED,
- ArchiveResult.StatusChoices.FAILED,
- ArchiveResult.StatusChoices.SKIPPED,
- )
-
- @queued.enter
- def enter_queued(self):
- # Suppressed: state transition logs
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now(),
- status=ArchiveResult.StatusChoices.QUEUED,
- start_ts=None,
-        ) # bump retry_at so the worker picks up any new changes
-
- @started.enter
- def enter_started(self):
- from archivebox.machine.models import NetworkInterface
-
- # Suppressed: state transition logs
- # Lock the object and mark start time
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
- status=ArchiveResult.StatusChoices.STARTED,
- start_ts=timezone.now(),
- iface=NetworkInterface.current(),
- )
-
- # Run the plugin - this updates status, output, timestamps, etc.
- self.archiveresult.run()
-
- # Save the updated result
- self.archiveresult.save()
-
- # Suppressed: plugin result logs (already logged by worker)
-
- @backoff.enter
- def enter_backoff(self):
- # Suppressed: state transition logs
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=60),
- status=ArchiveResult.StatusChoices.BACKOFF,
- end_ts=None,
- # retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
- )
-
- @succeeded.enter
- def enter_succeeded(self):
- # Suppressed: state transition logs
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.SUCCEEDED,
- end_ts=timezone.now(),
- # **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
- )
- self.archiveresult.save()
-
- # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
- self.archiveresult.cascade_health_update(success=True)
-
- @failed.enter
- def enter_failed(self):
- # Suppressed: state transition logs
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.FAILED,
- end_ts=timezone.now(),
- )
-
- # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
- self.archiveresult.cascade_health_update(success=False)
-
- @skipped.enter
- def enter_skipped(self):
- # Suppressed: state transition logs
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.SKIPPED,
- end_ts=timezone.now(),
- )
-
- def after_transition(self, event: str, source: State, target: State):
- # print(f"after '{event}' from '{source.id}' to '{target.id}'")
- self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
-
-
-# =============================================================================
-# State Machine Registration
-# =============================================================================
-
-# Manually register state machines with python-statemachine registry
-# (normally auto-discovered from statemachines.py, but we define them here for clarity)
-registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
\ No newline at end of file
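
Note on the removed core/models.py code above: the hook contract encoded by ArchiveResult.run() and update_from_output() is that each hook prints newline-delimited JSON to stdout; the worker picks out the record whose type is 'ArchiveResult' to fill in status, output_str, output_json, cmd, and cmd_version, and hands every other record (e.g. type 'Snapshot') to process_hook_records() after depth and allow/deny-list filtering. A minimal sketch of a conforming foreground hook follows; the filename, URLs, and cmd value are placeholders, not an actual ArchiveBox plugin.

#!/usr/bin/env python3
# Sketch of a foreground hook matching the JSONL contract parsed by update_from_output() above.
import json
import sys

def main() -> int:
    # ...do the real archiving work here, writing files into the current output dir...

    # Record the worker maps onto status / output_str / output_json / cmd / cmd_version:
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': 'succeeded',                         # 'succeeded' | 'failed' | 'skipped'
        'output_str': 'example.html',                  # human-readable summary
        'output_json': {'content_type': 'text/html'},  # structured metadata
        'cmd': ['/usr/bin/true'],                      # cmd[0] is matched against Binary abspath/name
        'cmd_version': '1.0.0',
    }))

    # Optional side-effect record: a discovered outlink, subject to depth + allow/deny filtering:
    print(json.dumps({'type': 'Snapshot', 'url': 'https://example.com/next', 'depth': 1}))
    return 0

if __name__ == '__main__':
    sys.exit(main())
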
diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py
index 685665a4..7e201f94 100644
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
- return mark_safe(tpl.render(ctx))
+ rendered = tpl.render(ctx)
+ # Only return non-empty content (strip whitespace to check)
+ if rendered.strip():
+ return mark_safe(rendered)
+ return ''
except Exception:
return ''
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
- return mark_safe(tpl.render(ctx))
+ rendered = tpl.render(ctx)
+ # Only return non-empty content (strip whitespace to check)
+ if rendered.strip():
+ return mark_safe(rendered)
+ return ''
except Exception:
return ''
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
- return mark_safe(tpl.render(ctx))
+ rendered = tpl.render(ctx)
+ # Only return non-empty content (strip whitespace to check)
+ if rendered.strip():
+ return mark_safe(rendered)
+ return ''
except Exception:
return ''
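
Reviewer note on the core_tags.py hunks above: the same render-then-suppress-empty fix is applied verbatim to plugin_thumbnail, plugin_embed, and plugin_fullscreen. If a follow-up cleanup is wanted, the triplicated logic could be factored into one helper along these lines (a sketch only; this helper is not part of the diff and its name is made up):

from django.utils.safestring import mark_safe

def render_nonempty(tpl, ctx) -> str:
    """Render a template and return '' instead of whitespace-only output."""
    rendered = tpl.render(ctx)
    return mark_safe(rendered) if rendered.strip() else ''
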
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 84a6bd2b..fd5dfbd8 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -539,7 +539,7 @@ from django.http import JsonResponse
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
- from workers.orchestrator import Orchestrator
+ from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py
index f7819eda..1bb34b3a 100644
--- a/archivebox/crawls/apps.py
+++ b/archivebox/crawls/apps.py
@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "archivebox.crawls"
+ label = "crawls"
+
+ def ready(self):
+ """Import models to register state machines with the registry"""
+ from archivebox.crawls.models import CrawlMachine # noqa: F401
diff --git a/archivebox/crawls/migrations/0002_drop_seed_model.py b/archivebox/crawls/migrations/0002_drop_seed_model.py
index c82dceb7..bf55c90a 100755
--- a/archivebox/crawls/migrations/0002_drop_seed_model.py
+++ b/archivebox/crawls/migrations/0002_drop_seed_model.py
@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
]
operations = [
- # Remove the seed foreign key from Crawl
- migrations.RemoveField(
- model_name='crawl',
- name='seed',
+ # Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
+ migrations.RunPython(
+ code=lambda apps, schema_editor: None,
+ reverse_code=migrations.RunPython.noop,
),
- # Delete the Seed model entirely
- migrations.DeleteModel(
- name='Seed',
+ # Delete the Seed model entirely (already done)
+ migrations.RunPython(
+ code=lambda apps, schema_editor: None,
+ reverse_code=migrations.RunPython.noop,
),
- # Update fields to new schema
- migrations.AlterField(
- model_name='crawl',
- name='created_by',
- field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
- ),
- migrations.AlterField(
- model_name='crawl',
- name='id',
- field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
- ),
- migrations.AlterField(
- model_name='crawl',
- name='urls',
- field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
- ),
- migrations.AlterField(
- model_name='crawlschedule',
- name='created_by',
- field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
- ),
- migrations.AlterField(
- model_name='crawlschedule',
- name='id',
- field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+    # NULL out seed_id values and drop the seed table, then update Django's migration state
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ # Update fields to new schema
+ migrations.AlterField(
+ model_name='crawl',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='crawl',
+ name='id',
+ field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='crawl',
+ name='urls',
+ field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
+ ),
+ migrations.AlterField(
+ model_name='crawlschedule',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='crawlschedule',
+ name='id',
+ field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ ],
+ database_operations=[
+ # Drop seed table and NULL out seed_id FK values
+ migrations.RunSQL(
+ sql="""
+ PRAGMA foreign_keys=OFF;
+
+ -- NULL out seed_id values in crawls_crawl
+ UPDATE crawls_crawl SET seed_id = NULL;
+
+ -- Drop seed table if it exists
+ DROP TABLE IF EXISTS crawls_seed;
+
+ PRAGMA foreign_keys=ON;
+ """,
+ reverse_sql=migrations.RunSQL.noop,
+ ),
+ ],
),
]
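
For anyone verifying the rewritten crawls/0002 migration above: after it runs, the real schema changes live only in the RunSQL block (seed_id values NULLed, crawls_seed dropped), while the field alterations exist only in Django's migration state. One way to confirm an existing database ended up in the expected shape is to inspect it directly; the sketch below assumes the default ArchiveBox database file index.sqlite3 in the data directory.

import sqlite3

# Sanity-check an existing database after applying the rewritten 0002 migration.
con = sqlite3.connect('index.sqlite3')  # assumed default ArchiveBox DB filename/location

tables = {row[0] for row in con.execute("SELECT name FROM sqlite_master WHERE type='table'")}
assert 'crawls_seed' not in tables, 'crawls_seed should have been dropped'

# The seed_id column is intentionally left in place (see crawls/0005 below), but its values should be NULL.
columns = {row[1] for row in con.execute('PRAGMA table_info(crawls_crawl)')}
if 'seed_id' in columns:
    leftover = con.execute('SELECT COUNT(*) FROM crawls_crawl WHERE seed_id IS NOT NULL').fetchone()[0]
    assert leftover == 0, 'seed_id values should have been NULLed out'

con.close()
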
diff --git a/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py
index f4c26aa5..4d5b335d 100644
--- a/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py
+++ b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py
@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
dependencies = [
('crawls', '0002_drop_seed_model'),
+ ('core', '0024_d_fix_crawls_config'), # Depends on config fix
]
operations = [
- migrations.AlterField(
- model_name='crawl',
- name='output_dir',
- field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+ # Update Django's state only to avoid table rebuild that would re-apply old constraints
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AlterField(
+ model_name='crawl',
+ name='output_dir',
+ field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
+ ),
+ ],
+ database_operations=[
+ # No database changes - output_dir type change is cosmetic for Django admin
+ ],
),
]
diff --git a/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py b/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py
index 809cf722..919bd021 100644
--- a/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py
+++ b/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py
@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
]
operations = [
- migrations.AlterField(
- model_name='crawl',
- name='output_dir',
- field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+ # Update Django's state only to avoid table rebuild that would re-apply old constraints
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AlterField(
+ model_name='crawl',
+ name='output_dir',
+ field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
+ ),
+ ],
+ database_operations=[
+ # No database changes - output_dir type change is cosmetic for Django admin
+ ],
),
]
diff --git a/archivebox/crawls/migrations/0005_drop_seed_id_column.py b/archivebox/crawls/migrations/0005_drop_seed_id_column.py
new file mode 100644
index 00000000..60bdecf1
--- /dev/null
+++ b/archivebox/crawls/migrations/0005_drop_seed_id_column.py
@@ -0,0 +1,28 @@
+# Drop seed_id column from Django's state (leave in database to avoid FK issues)
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('crawls', '0004_alter_crawl_output_dir'),
+ ]
+
+ operations = [
+ # Update Django's state only - leave seed_id column in database (unused but harmless)
+ # This avoids FK mismatch errors with crawls_crawlschedule
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ # Remove seed field from Django's migration state
+ migrations.RemoveField(
+ model_name='crawl',
+ name='seed',
+ ),
+ ],
+ database_operations=[
+ # No database changes - seed_id column remains to avoid FK rebuild issues
+ # crawls_seed table can be manually dropped by DBA if needed
+ ],
+ ),
+ ]
diff --git a/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py b/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py
new file mode 100644
index 00000000..02805c72
--- /dev/null
+++ b/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import pathlib
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('crawls', '0005_drop_seed_id_column'),
+ ]
+
+ operations = [
+ # Update Django's state only - database already correct
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AlterField(
+ model_name='crawl',
+ name='config',
+ field=models.JSONField(blank=True, default=dict, null=True),
+ ),
+ migrations.AlterField(
+ model_name='crawl',
+ name='output_dir',
+ field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
+ ),
+ migrations.DeleteModel(
+ name='Seed',
+ ),
+ ],
+ database_operations=[
+            # No database changes - Seed table already dropped by migration 0002
+ ],
+ ),
+ ]
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 420db4a2..a0c9cdda 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
- config = models.JSONField(default=dict)
+ config = models.JSONField(default=dict, null=True, blank=True)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
- state_machine_name = 'crawls.models.CrawlMachine'
+ state_machine_name = 'archivebox.crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
- 'created_by_id': self.created_by_id,
'depth': 0,
},
)
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'timestamp': timestamp or str(timezone.now().timestamp()),
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
- 'created_by_id': self.created_by_id,
+            # Note: created_by removed in 0.9.0 - Snapshot inherits created_by from its parent Crawl
}
)
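
On the crawls/models.py change above: with created_by_id dropped from the Snapshot creation defaults, attribution presumably flows through the crawl foreign key instead. A minimal sketch of what that lookup could look like for callers; this helper is illustrative only and is not something the diff adds to core.models.Snapshot.

def snapshot_created_by(snapshot):
    """Resolve attribution via the parent Crawl instead of a per-Snapshot created_by FK."""
    return snapshot.crawl.created_by if snapshot.crawl_id else None
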
diff --git a/archivebox/machine/apps.py b/archivebox/machine/apps.py
index f9b297a9..bbc02f78 100644
--- a/archivebox/machine/apps.py
+++ b/archivebox/machine/apps.py
@@ -7,8 +7,13 @@ class MachineConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'archivebox.machine'
+ label = 'machine' # Explicit label for migrations
verbose_name = 'Machine Info'
+ def ready(self):
+ """Import models to register state machines with the registry"""
+ from archivebox.machine import models # noqa: F401
+
def register_admin(admin_site):
from archivebox.machine.admin import register_admin
diff --git a/archivebox/machine/migrations/0001_squashed.py b/archivebox/machine/migrations/0001_squashed.py
index cd2c7db9..3ef5b8be 100644
--- a/archivebox/machine/migrations/0001_squashed.py
+++ b/archivebox/machine/migrations/0001_squashed.py
@@ -85,6 +85,12 @@ class Migration(migrations.Migration):
('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
+ # Fields added in migration 0005 (included here for fresh installs)
+ ('binproviders', models.CharField(blank=True, default='env', max_length=127)),
+ ('output_dir', models.CharField(blank=True, default='', max_length=255)),
+ ('overrides', models.JSONField(blank=True, default=dict)),
+ ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
+ ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
# dependency FK removed - Dependency model deleted
],
options={
diff --git a/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py b/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py
new file mode 100644
index 00000000..6d4b8ac7
--- /dev/null
+++ b/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py
@@ -0,0 +1,104 @@
+# Generated by Django 6.0 on 2025-12-29 06:45
+
+import django.db.models.deletion
+import django.utils.timezone
+from archivebox.uuid_compat import uuid7
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('machine', '0004_drop_dependency_table'),
+ ]
+
+ operations = [
+ # Update Django's state only - database already has correct schema
+ migrations.SeparateDatabaseAndState(
+ state_operations=[
+ migrations.AddField(
+ model_name='binary',
+ name='binproviders',
+ field=models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127),
+ ),
+ migrations.AddField(
+ model_name='binary',
+ name='output_dir',
+ field=models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255),
+ ),
+ migrations.AddField(
+ model_name='binary',
+ name='overrides',
+ field=models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}"),
+ ),
+ migrations.AddField(
+ model_name='binary',
+ name='retry_at',
+ field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True),
+ ),
+ migrations.AddField(
+ model_name='binary',
+ name='status',
+ field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='abspath',
+ field=models.CharField(blank=True, default='', max_length=255),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='binprovider',
+ field=models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='id',
+ field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='machine',
+ field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine'),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='name',
+ field=models.CharField(blank=True, db_index=True, default='', max_length=63),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='sha256',
+ field=models.CharField(blank=True, default='', max_length=64),
+ ),
+ migrations.AlterField(
+ model_name='binary',
+ name='version',
+ field=models.CharField(blank=True, default='', max_length=32),
+ ),
+ migrations.AlterField(
+ model_name='machine',
+ name='config',
+ field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True),
+ ),
+ migrations.AlterField(
+ model_name='machine',
+ name='id',
+ field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='machine',
+ name='stats',
+ field=models.JSONField(blank=True, default=dict, null=True),
+ ),
+ migrations.AlterField(
+ model_name='networkinterface',
+ name='id',
+ field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ ],
+ database_operations=[
+ # No database changes - schema already correct from previous migrations
+ ],
+ ),
+ ]
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index aeffd71c..cb4130f2 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -44,8 +44,8 @@ class Machine(ModelWithHealthStats):
os_platform = models.CharField(max_length=63, default=None, null=False)
os_release = models.CharField(max_length=63, default=None, null=False)
os_kernel = models.CharField(max_length=255, default=None, null=False)
- stats = models.JSONField(default=dict, null=False)
- config = models.JSONField(default=dict, null=False, blank=True,
+ stats = models.JSONField(default=dict, null=True, blank=True)
+ config = models.JSONField(default=dict, null=True, blank=True,
help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)")
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -213,7 +213,7 @@ class Binary(ModelWithHealthStats):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
- state_machine_name: str = 'machine.models.BinaryMachine'
+ state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
objects: BinaryManager = BinaryManager()
diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py
index d7440140..9a1cfb90 100644
--- a/archivebox/personas/apps.py
+++ b/archivebox/personas/apps.py
@@ -4,3 +4,4 @@ from django.apps import AppConfig
class SessionsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "archivebox.personas"
+ label = "personas"
diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py
index 49b357d4..99f8ef87 100644
--- a/archivebox/personas/models.py
+++ b/archivebox/personas/models.py
@@ -21,7 +21,7 @@
# # COOKIES_TXT_FILE: '/path/to/cookies.txt',
# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
# # CHECK_SSL_VALIDITY: False,
-# # SAVE_ARCHIVE_DOT_ORG: True,
+# # SAVE_ARCHIVEDOTORG: True,
# # CHROME_BINARY: 'chromium'
# # ...
# # }
diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
index 69f7c331..7b639efd 100644
--- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
+++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
@@ -63,7 +63,7 @@ def test_ripgrep_hook_detects_binary_from_path():
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
"""Test that ripgrep hook exits silently when search backend is not ripgrep."""
- hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend
@@ -82,7 +82,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep():
def test_ripgrep_hook_handles_absolute_path():
"""Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
- hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
rg_path = shutil.which('rg')
if not rg_path:
@@ -222,7 +222,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
if not shutil.which('rg'):
pytest.skip("ripgrep not installed")
- hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+ hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
# Test 1: With ripgrep backend - should output Binary record
env1 = os.environ.copy()
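
Regarding the ripgrep test changes above: the tests exercise the renamed on_Crawl__00_install_ripgrep.py hook as a standalone subprocess, with the search backend selected via the SEARCH_BACKEND_ENGINE environment variable. Reconstructed from the fragments shown, the invocation pattern is roughly the following (the exact arguments in the real test file may differ):

import os
import subprocess
import sys
from pathlib import Path

hook_path = Path('archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py')

env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'  # with any other backend the hook should exit silently

# Run the hook the same way the tests do: as its own process, reading config from env vars.
result = subprocess.run([sys.executable, str(hook_path)], env=env, capture_output=True, text=True, timeout=60)
print(result.returncode, result.stdout)
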
diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html
index a08a87f9..1b6b2bbd 100644
--- a/archivebox/templates/core/snapshot.html
+++ b/archivebox/templates/core/snapshot.html
@@ -360,9 +360,11 @@
-
+