diff --git a/README.md b/README.md index 45406ee6..00656468 100644 --- a/README.md +++ b/README.md @@ -763,7 +763,7 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
TIMEOUT=240 # default: 60 add more seconds on slower networks CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL -SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving +SAVE_ARCHIVEDOTORG=False # default: True False = disable Archive.org saving MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size
PUBLIC_INDEX=True # default: True whether anon users can view index @@ -959,7 +959,7 @@ archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument' archivebox add 'https://vimeo.com/somePrivateVideo' # without first disabling saving to Archive.org: -archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org +archivebox config --set SAVE_ARCHIVEDOTORG=False # disable saving all URLs in Archive.org # restrict the main index, Snapshot content, and Add Page to authenticated users as-needed: archivebox config --set PUBLIC_INDEX=False diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 2cf819d4..8c0be7a0 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -26,10 +26,10 @@ ASCII_LOGO = """ PACKAGE_DIR = Path(__file__).resolve().parent -# Add PACKAGE_DIR to sys.path - required for Django migrations to import models -# Migrations reference models like 'machine.Binary' which need to be importable -if str(PACKAGE_DIR) not in sys.path: - sys.path.append(str(PACKAGE_DIR)) +# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models +# # Migrations reference models like 'machine.Binary' which need to be importable +# if str(PACKAGE_DIR) not in sys.path: +# sys.path.append(str(PACKAGE_DIR)) os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings' os.environ['TZ'] = 'UTC' diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py index 86ee88ad..a1a0655a 100644 --- a/archivebox/api/apps.py +++ b/archivebox/api/apps.py @@ -5,6 +5,7 @@ from django.apps import AppConfig class APIConfig(AppConfig): name = 'archivebox.api' + label = 'api' def register_admin(admin_site): diff --git a/archivebox/api/v1_workers.py b/archivebox/api/v1_workers.py index d95c6ff6..f4ff580e 100644 --- a/archivebox/api/v1_workers.py +++ b/archivebox/api/v1_workers.py @@ -94,7 +94,7 @@ class OrchestratorSchema(Schema): @router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator") def get_orchestrator(request): """Get the orchestrator status and all worker queues.""" - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker orchestrator = Orchestrator() diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index 66499231..55f033b0 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -73,7 +73,7 @@ class ModelWithUUID(models.Model): return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}' def as_json(self, keys: Iterable[str] = ()) -> dict: - default_keys = ('id', 'created_at', 'modified_at', 'created_by_id') + default_keys = ('id', 'created_at', 'modified_at') return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)} @@ -119,7 +119,7 @@ class ModelWithHealthStats(models.Model): class ModelWithConfig(models.Model): """Mixin for models with a JSON config field.""" - config = models.JSONField(default=dict, null=False, blank=False, editable=True) + config = models.JSONField(default=dict, null=True, blank=True, editable=True) class Meta: abstract = True diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 3a991d39..234d1316 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -56,7 +56,7 @@ def add(urls: str | list[str], from archivebox.core.models import 
Snapshot from archivebox.crawls.models import Crawl from archivebox.base_models.models import get_or_create_system_user_pk - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator created_by_id = created_by_id or get_or_create_system_user_pk() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index f73553db..3bedaade 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -78,7 +78,7 @@ def discover_outlinks( from archivebox.core.models import Snapshot, ArchiveResult from archivebox.crawls.models import Crawl from archivebox.config import CONSTANTS - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator created_by_id = get_or_create_system_user_pk() is_tty = sys.stdout.isatty() diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index 4005f365..29abd63d 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -96,7 +96,7 @@ def run_plugins( TYPE_SNAPSHOT, TYPE_ARCHIVERESULT ) from archivebox.core.models import Snapshot, ArchiveResult - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator is_tty = sys.stdout.isatty() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index e4dc58a4..ed67c77d 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types @enforce_types -def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None: +def init(force: bool=False, quick: bool=False, install: bool=False) -> None: """Initialize a new ArchiveBox collection in the current directory""" - install = install or setup - from archivebox.config import CONSTANTS, VERSION, DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.config.collection import write_config_file @@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool= print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]') if pending_links: - Snapshot.objects.create_from_dicts(list(pending_links.values())) + for link_dict in pending_links.values(): + Snapshot.from_jsonl(link_dict) # Hint for orphaned snapshot directories print() @@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool= @click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway') @click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs') @click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving') -@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install') @docstring(init.__doc__) def main(**kwargs) -> None: init(**kwargs) diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index e9a7f7a5..f35adf5e 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None: print() # Run the crawl synchronously (this triggers on_Crawl hooks) - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import 
Orchestrator orchestrator = Orchestrator(exit_on_idle=True) orchestrator.runloop() diff --git a/archivebox/cli/archivebox_orchestrator.py b/archivebox/cli/archivebox_orchestrator.py index 9f277e7d..4b272727 100644 --- a/archivebox/cli/archivebox_orchestrator.py +++ b/archivebox/cli/archivebox_orchestrator.py @@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int: 0: All work completed successfully 1: Error occurred """ - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator if Orchestrator.is_running(): print('[yellow]Orchestrator is already running[/yellow]') diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index fb0b1148..49490142 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), tail_multiple_worker_logs, is_port_in_use, ) - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator import sys # Check if port is already in use diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 6fba01a3..4d2f7b5f 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -163,7 +163,7 @@ def create_snapshots( # If --plugins is passed, run the orchestrator for those plugins if plugins: - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr) orchestrator = Orchestrator(exit_on_idle=True) orchestrator.runloop() diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 49ba8f13..b0e29be9 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict: total = Snapshot.objects.count() print(f'[*] Processing {total} snapshots from database...') - for snapshot in Snapshot.objects.iterator(): + for snapshot in Snapshot.objects.iterator(chunk_size=batch_size): # Reconcile index.json with DB snapshot.reconcile_with_index_json() @@ -209,7 +209,7 @@ def process_filtered_snapshots( total = snapshots.count() print(f'[*] Found {total} matching snapshots') - for snapshot in snapshots.iterator(): + for snapshot in snapshots.iterator(chunk_size=batch_size): # Reconcile index.json with DB snapshot.reconcile_with_index_json() diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py index 5a2b74b9..27dec785 100644 --- a/archivebox/cli/tests.py +++ b/archivebox/cli/tests.py @@ -17,7 +17,7 @@ TEST_CONFIG = { 'DATA_DIR': 'data.tests', - 'SAVE_ARCHIVE_DOT_ORG': 'False', + 'SAVE_ARCHIVEDOTORG': 'False', 'SAVE_TITLE': 'False', 'USE_CURL': 'False', diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 88a7435d..23967550 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock TEST_CONFIG = { 'USE_COLOR': 'False', 'SHOW_PROGRESS': 'False', - 'SAVE_ARCHIVE_DOT_ORG': 'False', + 'SAVE_ARCHIVEDOTORG': 'False', 'SAVE_TITLE': 'True', # Fast extractor 'SAVE_FAVICON': 'False', 'SAVE_WGET': 'False', diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 40d8db4c..9f6ee979 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ 
-216,6 +216,29 @@ def get_config(
     if snapshot and hasattr(snapshot, "config") and snapshot.config:
         config.update(snapshot.config)
+    # Normalize all aliases to canonical names (after all sources merged)
+    # This handles aliases that came from user/crawl/snapshot configs, not just env
+    try:
+        from archivebox.hooks import discover_plugin_configs
+        plugin_configs = discover_plugin_configs()
+        aliases_to_normalize = {}  # {alias_key: canonical_key}
+
+        # Build alias mapping from all plugin schemas
+        for plugin_name, schema in plugin_configs.items():
+            for canonical_key, prop_schema in schema.get('properties', {}).items():
+                for alias in prop_schema.get('x-aliases', []):
+                    aliases_to_normalize[alias] = canonical_key
+
+        # Normalize: copy alias values to canonical keys (aliases take precedence)
+        for alias_key, canonical_key in aliases_to_normalize.items():
+            if alias_key in config:
+                # Alias exists - copy to canonical key (overwriting any default)
+                config[canonical_key] = config[alias_key]
+                # Remove alias from config to keep it clean
+                del config[alias_key]
+    except ImportError:
+        pass
+
     return config
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 5b173784..0362afe3 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -5,8 +5,12 @@ from django.apps import AppConfig
 class CoreConfig(AppConfig):
     name = 'archivebox.core'
+    label = 'core'
 
     def ready(self):
         """Register the archivebox.core.admin_site as the main django admin site"""
         from archivebox.core.admin_site import register_admin_site
         register_admin_site()
+
+        # Import models to register state machines with the registry
+        from archivebox.core import models  # noqa: F401
diff --git a/archivebox/core/migrations/0024_b_clear_config_fields.py b/archivebox/core/migrations/0024_b_clear_config_fields.py
new file mode 100644
index 00000000..112688dd
--- /dev/null
+++ b/archivebox/core/migrations/0024_b_clear_config_fields.py
@@ -0,0 +1,57 @@
+# Data migration to clear config fields that may contain invalid JSON
+# This runs before 0025 to prevent CHECK constraint failures
+
+from django.db import migrations
+
+
+def clear_config_fields(apps, schema_editor):
+    """Clear all config fields in related tables to avoid JSON validation errors."""
+    db_alias = schema_editor.connection.alias
+
+    # Disable foreign key checks temporarily to allow updates
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=OFF")
+
+    tables_to_clear = [
+        ('crawls_seed', 'config'),
+        ('crawls_crawl', 'config'),
+        ('crawls_crawlschedule', 'config'),  # cleared only if the table exists (checked below)
+        ('machine_machine', 'stats'),
+        ('machine_machine', 'config'),
+    ]
+
+    for table_info in tables_to_clear:
+        if table_info is None:
+            continue
+        table_name, field_name = table_info
+
+        try:
+            with schema_editor.connection.cursor() as cursor:
+                # Check if table exists first
+                cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+                if not cursor.fetchone():
+                    print(f" Skipping {table_name}.{field_name}: table does not exist")
+                    continue
+
+                # Set all to empty JSON object
+                cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
+                print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
+        except Exception as e:
+            print(f" Skipping {table_name}.{field_name}: {e}")
+
+    # Re-enable foreign key checks
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA foreign_keys=ON")
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [ + ('core', '0023_new_schema'), + ('crawls', '0001_initial'), + ('machine', '0001_squashed'), + ] + + operations = [ + migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0024_c_disable_fk_checks.py b/archivebox/core/migrations/0024_c_disable_fk_checks.py new file mode 100644 index 00000000..8bee7270 --- /dev/null +++ b/archivebox/core/migrations/0024_c_disable_fk_checks.py @@ -0,0 +1,28 @@ +# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors + +from django.db import migrations + + +def disable_fk_checks(apps, schema_editor): + """Temporarily disable foreign key checks.""" + with schema_editor.connection.cursor() as cursor: + cursor.execute("PRAGMA foreign_keys=OFF") + print(" Disabled foreign key checks") + + +def enable_fk_checks(apps, schema_editor): + """Re-enable foreign key checks.""" + with schema_editor.connection.cursor() as cursor: + cursor.execute("PRAGMA foreign_keys=ON") + print(" Enabled foreign key checks") + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_b_clear_config_fields'), + ] + + operations = [ + migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks), + ] diff --git a/archivebox/core/migrations/0024_d_fix_crawls_config.py b/archivebox/core/migrations/0024_d_fix_crawls_config.py new file mode 100644 index 00000000..e1df3322 --- /dev/null +++ b/archivebox/core/migrations/0024_d_fix_crawls_config.py @@ -0,0 +1,93 @@ +# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds + +from django.db import migrations + + +def fix_crawls_config(apps, schema_editor): + """ + Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable. + Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet). + For fresh installs, crawls.0001_initial creates the correct schema. 
+ """ + with schema_editor.connection.cursor() as cursor: + # Check if this is an upgrade from old 0.8.x or a fresh install + # In fresh installs, crawls.0001_initial was applied, creating seed FK + # In upgrades, the table was created by old migrations before 0001_initial existed + cursor.execute(""" + SELECT COUNT(*) FROM django_migrations + WHERE app='crawls' AND name='0001_initial' + """) + has_crawls_0001 = cursor.fetchone()[0] > 0 + + if has_crawls_0001: + # Fresh install - crawls.0001_initial already created the correct schema + # Just clear config to avoid CHECK constraint issues + print(" Fresh install detected - clearing config field only") + try: + cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL') + except Exception as e: + print(f" Skipping config clear: {e}") + return + + # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint + print(" Upgrading from 0.8.x - rebuilding crawls_crawl table") + cursor.execute("PRAGMA foreign_keys=OFF") + + # Backup + cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl") + + # Recreate without config CHECK constraint, with nullable seed_id + cursor.execute("DROP TABLE crawls_crawl") + cursor.execute(""" + CREATE TABLE "crawls_crawl" ( + "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0), + "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0), + "id" char(32) NOT NULL PRIMARY KEY, + "created_at" datetime NOT NULL, + "modified_at" datetime NOT NULL, + "urls" text NOT NULL, + "config" text, + "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0), + "tags_str" varchar(1024) NOT NULL, + "persona_id" char(32) NULL, + "label" varchar(64) NOT NULL, + "notes" text NOT NULL, + "output_dir" varchar(512) NOT NULL, + "status" varchar(15) NOT NULL, + "retry_at" datetime NULL, + "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED, + "seed_id" char(32) NULL DEFAULT NULL, + "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED + ) + """) + + # Restore data + cursor.execute(""" + INSERT INTO "crawls_crawl" ( + "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at", + "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes", + "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id" + ) + SELECT + "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at", + "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes", + "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id" + FROM crawls_crawl_backup + """) + + cursor.execute("DROP TABLE crawls_crawl_backup") + + # NULL out config to avoid any invalid JSON + cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_c_disable_fk_checks'), + ('crawls', '0001_initial'), + ] + + operations = [ + migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0024_snapshot_crawl.py b/archivebox/core/migrations/0024_snapshot_crawl.py index 84c285bc..c8b47bf2 100644 --- a/archivebox/core/migrations/0024_snapshot_crawl.py +++ b/archivebox/core/migrations/0024_snapshot_crawl.py @@ -8,9 +8,7 @@ import django.db.models.deletion class Migration(migrations.Migration): dependencies = [ - ('core', '0023_new_schema'), - ('crawls', '0001_initial'), - ('machine', 
'0001_squashed'), + ('core', '0024_d_fix_crawls_config'), ] operations = [ diff --git a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py index 13707940..5ec70d47 100755 --- a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py +++ b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py @@ -10,6 +10,13 @@ from django.db import migrations, models def populate_archiveresult_uuids(apps, schema_editor): """Generate unique UUIDs for ArchiveResults that don't have one.""" + # Check if uuid column exists before trying to populate it + with schema_editor.connection.cursor() as cursor: + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = [row[1] for row in cursor.fetchall()] + if 'uuid' not in columns: + return # uuid column doesn't exist, skip this data migration + ArchiveResult = apps.get_model('core', 'ArchiveResult') for result in ArchiveResult.objects.filter(uuid__isnull=True): result.uuid = uuid_compat.uuid7() @@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor): pass +def remove_output_dir_if_exists(apps, schema_editor): + """Remove output_dir columns if they exist.""" + with schema_editor.connection.cursor() as cursor: + # Check and remove from core_archiveresult + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = [row[1] for row in cursor.fetchall()] + if 'output_dir' in columns: + cursor.execute("ALTER TABLE core_archiveresult DROP COLUMN output_dir") + + # Check and remove from core_snapshot + cursor.execute("PRAGMA table_info(core_snapshot)") + columns = [row[1] for row in cursor.fetchall()] + if 'output_dir' in columns: + cursor.execute("ALTER TABLE core_snapshot DROP COLUMN output_dir") + + class Migration(migrations.Migration): dependencies = [ @@ -33,82 +56,90 @@ class Migration(migrations.Migration): migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids), # Remove output_dir fields (not needed, computed from snapshot) - migrations.RemoveField( - model_name='archiveresult', - name='output_dir', - ), - migrations.RemoveField( - model_name='snapshot', - name='output_dir', + migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop), + + # Update Django's migration state to match 0.9.x schema + # Database already has correct types from 0.8.x, just update state + migrations.SeparateDatabaseAndState( + state_operations=[ + # Archiveresult field alterations + migrations.AlterField( + model_name='archiveresult', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='archiveresult', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(db_index=True, max_length=32), + ), + # Convert id from AutoField to UUIDField (database already has UUID CHAR(32)) + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), 
('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), + ), + + # Snapshot field alterations + migrations.AlterField( + model_name='snapshot', + name='bookmarked_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='snapshot', + name='downloaded_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ], + database_operations=[ + # No actual database changes needed - schema is already correct from 0.8.x + ], ), - # Archiveresult field alterations - migrations.AlterField( - model_name='archiveresult', - name='created_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AlterField( - model_name='archiveresult', - name='created_by', - field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL), - ), - migrations.AlterField( - model_name='archiveresult', - name='extractor', - field=models.CharField(db_index=True, max_length=32), - ), - migrations.AlterField( - model_name='archiveresult', - name='id', - field=models.AutoField(editable=False, primary_key=True, serialize=False), - ), - migrations.AlterField( - model_name='archiveresult', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), - ), - - # Snapshot field alterations - migrations.AlterField( - model_name='snapshot', - name='bookmarked_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AlterField( - model_name='snapshot', - name='created_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AlterField( - model_name='snapshot', - name='created_by', - field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL), - ), - migrations.AlterField( - model_name='snapshot', - name='downloaded_at', - field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), - ), - migrations.AlterField( - model_name='snapshot', - name='id', - field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), - ), - - # SnapshotTag and Tag alterations - migrations.AlterField( - model_name='snapshottag', - name='id', - field=models.AutoField(primary_key=True, serialize=False), - ), - migrations.AlterField( - model_name='tag', - name='created_by', - field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, 
on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), - ), - migrations.AlterUniqueTogether( - name='snapshottag', - unique_together={('snapshot', 'tag')}, + # SnapshotTag and Tag alterations - state only, DB already correct + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name='snapshottag', + name='id', + field=models.AutoField(primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='tag', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'tag')}, + ), + ], + database_operations=[], ), ] diff --git a/archivebox/core/migrations/0029_archiveresult_hook_fields.py b/archivebox/core/migrations/0029_archiveresult_hook_fields.py index 41096eee..a8ddfb27 100644 --- a/archivebox/core/migrations/0029_archiveresult_hook_fields.py +++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py @@ -13,68 +13,79 @@ class Migration(migrations.Migration): ] operations = [ - # Add new output fields (keep old 'output' temporarily for migration) - migrations.AddField( - model_name='archiveresult', - name='output_str', - field=models.TextField( - blank=True, - default='', - help_text='Human-readable output summary (e.g., "Downloaded 5 files")' - ), - ), - - migrations.AddField( - model_name='archiveresult', - name='output_json', - field=models.JSONField( - null=True, - blank=True, - default=None, - help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields' - ), - ), - - migrations.AddField( - model_name='archiveresult', - name='output_files', - field=models.JSONField( - default=dict, - help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata' - ), - ), - - migrations.AddField( - model_name='archiveresult', - name='output_size', - field=models.BigIntegerField( - default=0, - help_text='Total recursive size in bytes of all output files' - ), - ), - - migrations.AddField( - model_name='archiveresult', - name='output_mimetypes', - field=models.CharField( - max_length=512, - blank=True, - default='', - help_text='CSV of mimetypes sorted by size descending' - ), - ), - - # Add binary FK (optional) - migrations.AddField( - model_name='archiveresult', - name='binary', - field=models.ForeignKey( - 'machine.Binary', - on_delete=models.SET_NULL, - null=True, - blank=True, - related_name='archiveresults', - help_text='Primary binary used by this hook (optional)' - ), + # Add new output fields using SeparateDatabaseAndState to avoid table rebuilds + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddField( + model_name='archiveresult', + name='output_str', + field=models.TextField( + blank=True, + default='', + help_text='Human-readable output summary (e.g., "Downloaded 5 files")' + ), + ), + migrations.AddField( + model_name='archiveresult', + name='output_json', + field=models.JSONField( + null=True, + blank=True, + default=None, + help_text='Structured metadata (headers, redirects, etc.) 
- should NOT duplicate ArchiveResult fields' + ), + ), + migrations.AddField( + model_name='archiveresult', + name='output_files', + field=models.JSONField( + default=dict, + help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata' + ), + ), + migrations.AddField( + model_name='archiveresult', + name='output_size', + field=models.BigIntegerField( + default=0, + help_text='Total recursive size in bytes of all output files' + ), + ), + migrations.AddField( + model_name='archiveresult', + name='output_mimetypes', + field=models.CharField( + max_length=512, + blank=True, + default='', + help_text='CSV of mimetypes sorted by size descending' + ), + ), + migrations.AddField( + model_name='archiveresult', + name='binary', + field=models.ForeignKey( + 'machine.Binary', + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name='archiveresults', + help_text='Primary binary used by this hook (optional)' + ), + ), + ], + database_operations=[ + migrations.RunSQL( + sql=""" + ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT ''; + ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT; + ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}'; + ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0; + ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT ''; + ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id); + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], ), ] diff --git a/archivebox/core/migrations/0030_migrate_output_field.py b/archivebox/core/migrations/0030_migrate_output_field.py index 5dafb7e8..6c0501ae 100644 --- a/archivebox/core/migrations/0030_migrate_output_field.py +++ b/archivebox/core/migrations/0030_migrate_output_field.py @@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor): Logic: - If output contains JSON {...}, move to output_json - Otherwise, move to output_str + + Use raw SQL to avoid CHECK constraint issues during migration. """ - ArchiveResult = apps.get_model('core', 'ArchiveResult') + # Use raw SQL to migrate data without triggering CHECK constraints + with schema_editor.connection.cursor() as cursor: + # Get all archive results + cursor.execute(""" + SELECT id, output FROM core_archiveresult + """) - for ar in ArchiveResult.objects.all().iterator(): - old_output = ar.output or '' + for row in cursor.fetchall(): + ar_id, old_output = row + old_output = old_output or '' - # Case 1: JSON output - if old_output.strip().startswith('{'): - try: - parsed = json.loads(old_output) - ar.output_json = parsed - ar.output_str = '' - except json.JSONDecodeError: - # Not valid JSON, treat as string - ar.output_str = old_output - - # Case 2: File path or plain string - else: - ar.output_str = old_output - - ar.save(update_fields=['output_str', 'output_json']) + # Case 1: JSON output + if old_output.strip().startswith('{'): + try: + # Validate it's actual JSON + parsed = json.loads(old_output) + # Update with JSON - cast to JSON to satisfy CHECK constraint + json_str = json.dumps(parsed) + cursor.execute(""" + UPDATE core_archiveresult + SET output_str = '', output_json = json(?) + WHERE id = ? + """, (json_str, ar_id)) + except json.JSONDecodeError: + # Not valid JSON, treat as string + cursor.execute(""" + UPDATE core_archiveresult + SET output_str = ?, output_json = NULL + WHERE id = ? 
+ """, (old_output, ar_id)) + # Case 2: File path or plain string + else: + cursor.execute(""" + UPDATE core_archiveresult + SET output_str = ?, output_json = NULL + WHERE id = ? + """, (old_output, ar_id)) def reverse_migrate(apps, schema_editor): diff --git a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py index cd8eb821..bbe45cba 100644 --- a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py +++ b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py @@ -16,43 +16,62 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - model_name='archiveresult', - name='binary', - field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'), + # Update Django's state only - database already has correct schema from 0029 + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name='archiveresult', + name='binary', + field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'), + ), + migrations.AlterField( + model_name='archiveresult', + name='output_files', + field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), + ), + migrations.AlterField( + model_name='archiveresult', + name='output_json', + field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='output_mimetypes', + field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), + ), + migrations.AlterField( + model_name='archiveresult', + name='output_size', + field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'), + ), + migrations.AlterField( + model_name='archiveresult', + name='output_str', + field=models.TextField(blank=True, default='', help_text='Human-readable output summary'), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True), + ), + ], + database_operations=[ + # No database changes needed - columns already exist with correct types + ], ), - migrations.AlterField( - model_name='archiveresult', - name='output_files', - field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_json', - field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_mimetypes', - field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_size', - field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_str', - field=models.TextField(blank=True, default='', help_text='Human-readable output summary'), - ), - migrations.AlterField( - model_name='archiveresult', - name='uuid', - 
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True), - ), - migrations.AddConstraint( - model_name='snapshot', - constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'), + # Add unique constraint without table rebuild + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddConstraint( + model_name='snapshot', + constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'), + ), + ], + database_operations=[ + migrations.RunSQL( + sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);", + reverse_sql="DROP INDEX IF EXISTS unique_timestamp;", + ), + ], ), ] diff --git a/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py b/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py index 4e0a20bf..bedb58db 100644 --- a/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py +++ b/archivebox/core/migrations/0033_rename_extractor_add_hook_name.py @@ -10,20 +10,35 @@ class Migration(migrations.Migration): ] operations = [ - migrations.RenameField( - model_name='archiveresult', - old_name='extractor', - new_name='plugin', - ), - migrations.AddField( - model_name='archiveresult', - name='hook_name', - field=models.CharField( - blank=True, - default='', - max_length=255, - db_index=True, - help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)' - ), + # Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RenameField( + model_name='archiveresult', + old_name='extractor', + new_name='plugin', + ), + migrations.AddField( + model_name='archiveresult', + name='hook_name', + field=models.CharField( + blank=True, + default='', + max_length=255, + db_index=True, + help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)' + ), + ), + ], + database_operations=[ + migrations.RunSQL( + sql=""" + ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin; + ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL; + CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name); + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], ), ] diff --git a/archivebox/core/migrations/0034_snapshot_current_step.py b/archivebox/core/migrations/0034_snapshot_current_step.py index f570230c..4b89fa21 100644 --- a/archivebox/core/migrations/0034_snapshot_current_step.py +++ b/archivebox/core/migrations/0034_snapshot_current_step.py @@ -11,13 +11,27 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AddField( - model_name='snapshot', - name='current_step', - field=models.PositiveSmallIntegerField( - default=0, - db_index=True, - help_text='Current hook step being executed (0-9). Used for sequential hook execution.' - ), + # Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddField( + model_name='snapshot', + name='current_step', + field=models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text='Current hook step being executed (0-9). Used for sequential hook execution.' 
+ ), + ), + ], + database_operations=[ + migrations.RunSQL( + sql=""" + ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL; + CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step); + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], ), ] diff --git a/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py b/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py index 50a3f33f..84ea3c23 100644 --- a/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py +++ b/archivebox/core/migrations/0035_snapshot_crawl_non_nullable_remove_created_by.py @@ -54,7 +54,7 @@ class Migration(migrations.Migration): dependencies = [ ('core', '0034_snapshot_current_step'), - ('crawls', '0004_alter_crawl_output_dir'), + ('crawls', '0005_drop_seed_id_column'), ] operations = [ @@ -64,16 +64,24 @@ class Migration(migrations.Migration): reverse_code=migrations.RunPython.noop, ), - # Step 2: Make crawl non-nullable - migrations.AlterField( - model_name='snapshot', - name='crawl', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), - ), - - # Step 3: Remove created_by field - migrations.RemoveField( - model_name='snapshot', - name='created_by', + # Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless) + migrations.SeparateDatabaseAndState( + state_operations=[ + # Make crawl non-nullable + migrations.AlterField( + model_name='snapshot', + name='crawl', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), + ), + # Remove created_by field from Django's state + migrations.RemoveField( + model_name='snapshot', + name='created_by', + ), + ], + database_operations=[ + # No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model + # created_by_id column remains in database but is unused + ], ), ] diff --git a/archivebox/core/migrations/0036_remove_archiveresult_created_by.py b/archivebox/core/migrations/0036_remove_archiveresult_created_by.py index 6a6d1f1f..5b6983c0 100644 --- a/archivebox/core/migrations/0036_remove_archiveresult_created_by.py +++ b/archivebox/core/migrations/0036_remove_archiveresult_created_by.py @@ -10,10 +10,18 @@ class Migration(migrations.Migration): ] operations = [ - # Remove created_by field from ArchiveResult + # Remove created_by field from ArchiveResult (state only) # No data migration needed - created_by can be accessed via snapshot.crawl.created_by - migrations.RemoveField( - model_name='archiveresult', - name='created_by', + # Leave created_by_id column in database (unused but harmless, avoids table rebuild) + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RemoveField( + model_name='archiveresult', + name='created_by', + ), + ], + database_operations=[ + # No database changes - leave created_by_id column in place to avoid table rebuild + ], ), ] diff --git a/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py new file mode 100644 index 00000000..592eed6a --- /dev/null +++ b/archivebox/core/migrations/0037_remove_archiveresult_output_dir_and_more.py @@ -0,0 +1,44 @@ +# Generated by Django 6.0 on 2025-12-29 06:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + 
dependencies = [ + ('core', '0036_remove_archiveresult_created_by'), + ] + + operations = [ + # Update Django's state only - database columns remain for backwards compat + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RemoveField( + model_name='archiveresult', + name='output_dir', + ), + migrations.RemoveField( + model_name='snapshot', + name='output_dir', + ), + migrations.AlterField( + model_name='archiveresult', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + ), + ], + database_operations=[ + # No database changes - columns remain in place to avoid table rebuilds + ], + ), + ] diff --git a/archivebox/core/migrations/0038_fix_missing_columns.py b/archivebox/core/migrations/0038_fix_missing_columns.py new file mode 100644 index 00000000..3c1e6551 --- /dev/null +++ b/archivebox/core/migrations/0038_fix_missing_columns.py @@ -0,0 +1,84 @@ +# Add missing columns to ArchiveResult and remove created_by_id from Snapshot + +from django.db import migrations, models, connection +import django.utils.timezone + + +def add_columns_if_not_exist(apps, schema_editor): + """Add columns to ArchiveResult only if they don't already exist.""" + with connection.cursor() as cursor: + # Get existing columns + cursor.execute("PRAGMA table_info(core_archiveresult)") + existing_columns = {row[1] for row in cursor.fetchall()} + + # Add num_uses_failed if it doesn't exist + if 'num_uses_failed' not in existing_columns: + cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)") + + # Add num_uses_succeeded if it doesn't exist + if 'num_uses_succeeded' not in existing_columns: + cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)") + + # Add config if it doesn't exist + if 'config' not in existing_columns: + cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL") + + # Add retry_at if it doesn't exist + if 'retry_at' not in existing_columns: + cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL") + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0037_remove_archiveresult_output_dir_and_more'), + ] + + operations = [ + # Add missing columns to ArchiveResult + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddField( + model_name='archiveresult', + name='num_uses_failed', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='num_uses_succeeded', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + ], + database_operations=[ + 
migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop), + ], + ), + + # Drop created_by_id from Snapshot (database only, already removed from model in 0035) + migrations.SeparateDatabaseAndState( + state_operations=[ + # No state changes - field already removed in 0035 + ], + database_operations=[ + migrations.RunSQL( + sql=""" + -- Drop index first, then column + DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149; + ALTER TABLE core_snapshot DROP COLUMN created_by_id; + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0039_fix_num_uses_values.py b/archivebox/core/migrations/0039_fix_num_uses_values.py new file mode 100644 index 00000000..4c04ed3e --- /dev/null +++ b/archivebox/core/migrations/0039_fix_num_uses_values.py @@ -0,0 +1,30 @@ +# Fix num_uses_failed and num_uses_succeeded string values to integers + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0038_fix_missing_columns'), + ] + + operations = [ + # Fix string values that got inserted as literals instead of integers + migrations.RunSQL( + sql=""" + UPDATE core_snapshot + SET num_uses_failed = 0 + WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed'; + + UPDATE core_snapshot + SET num_uses_succeeded = 0 + WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded'; + + UPDATE core_snapshot + SET depth = 0 + WHERE typeof(depth) = 'text' OR depth = 'depth'; + """, + reverse_sql=migrations.RunSQL.noop, + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index cf4216c6..4c0e026b 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ) merged = 0 - for dup in duplicates.iterator(): + for dup in duplicates.iterator(chunk_size=500): snapshots = list( cls.objects .filter(url=dup['url'], timestamp=dup['timestamp']) diff --git a/archivebox/core/models.py.bak b/archivebox/core/models.py.bak deleted file mode 100755 index a99d9360..00000000 --- a/archivebox/core/models.py.bak +++ /dev/null @@ -1,2638 +0,0 @@ -__package__ = 'archivebox.core' - -from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING -from archivebox.uuid_compat import uuid7 -from datetime import datetime, timedelta -from django_stubs_ext.db.models import TypedModelMeta - -import os -import json -from pathlib import Path - -from statemachine import State, registry - -from django.db import models -from django.db.models import QuerySet, Value, Case, When, IntegerField -from django.utils.functional import cached_property -from django.utils.text import slugify -from django.utils import timezone -from django.core.cache import cache -from django.urls import reverse, reverse_lazy -from django.contrib import admin -from django.conf import settings - -from archivebox.config import CONSTANTS -from archivebox.misc.system import get_dir_size, atomic_write -from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode -from archivebox.misc.hashing import get_dir_info -from archivebox.hooks import ( - EXTRACTOR_INDEXING_PRECEDENCE, - get_plugins, get_plugin_name, get_plugin_icon, - DEFAULT_PLUGIN_ICONS, -) -from archivebox.base_models.models import ( - ModelWithUUID, ModelWithSerializers, ModelWithOutputDir, - ModelWithConfig, ModelWithNotes, 
ModelWithHealthStats, - get_or_create_system_user_pk, -) -from workers.models import ModelWithStateMachine, BaseStateMachine -from workers.tasks import bg_archive_snapshot -from archivebox.crawls.models import Crawl -from archivebox.machine.models import NetworkInterface, Binary - - - -class Tag(ModelWithSerializers): - # Keep AutoField for compatibility with main branch migrations - # Don't use UUIDField here - requires complex FK transformation - id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set') - created_at = models.DateTimeField(default=timezone.now, db_index=True, null=True) - modified_at = models.DateTimeField(auto_now=True) - name = models.CharField(unique=True, blank=False, max_length=100) - slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) - - snapshot_set: models.Manager['Snapshot'] - - class Meta(TypedModelMeta): - verbose_name = "Tag" - verbose_name_plural = "Tags" - - def __str__(self): - return self.name - - def save(self, *args, **kwargs): - is_new = self._state.adding - if is_new: - self.slug = slugify(self.name) - existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True)) - i = None - while True: - slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name) - if slug not in existing: - self.slug = slug - break - i = (i or 0) + 1 - super().save(*args, **kwargs) - - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created Tag', - indent_level=0, - metadata={ - 'id': self.id, - 'name': self.name, - 'slug': self.slug, - }, - ) - - @property - def api_url(self) -> str: - return reverse_lazy('api-1:get_tag', args=[self.id]) - - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): - """ - Create/update Tag from JSONL record. 
- - Args: - record: JSONL record with 'name' field - overrides: Optional dict with 'snapshot' to auto-attach tag - - Returns: - Tag instance or None - """ - from archivebox.misc.jsonl import get_or_create_tag - - try: - tag = get_or_create_tag(record) - - # Auto-attach to snapshot if in overrides - if overrides and 'snapshot' in overrides and tag: - overrides['snapshot'].tags.add(tag) - - return tag - except ValueError: - return None - - -class SnapshotTag(models.Model): - id = models.AutoField(primary_key=True) - snapshot = models.ForeignKey('Snapshot', db_column='snapshot_id', on_delete=models.CASCADE, to_field='id') - tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id') - - class Meta: - db_table = 'core_snapshot_tags' - unique_together = [('snapshot', 'tag')] - - -class SnapshotQuerySet(models.QuerySet): - """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc.""" - - # ========================================================================= - # Filtering Methods - # ========================================================================= - - FILTER_TYPES = { - 'exact': lambda pattern: models.Q(url=pattern), - 'substring': lambda pattern: models.Q(url__icontains=pattern), - 'regex': lambda pattern: models.Q(url__iregex=pattern), - 'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"), - 'tag': lambda pattern: models.Q(tags__name=pattern), - 'timestamp': lambda pattern: models.Q(timestamp=pattern), - } - - def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> 'SnapshotQuerySet': - """Filter snapshots by URL patterns using specified filter type""" - from archivebox.misc.logging import stderr - - q_filter = models.Q() - for pattern in patterns: - try: - q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern) - except KeyError: - stderr() - stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red') - stderr(f' {pattern}') - raise SystemExit(2) - return self.filter(q_filter) - - def search(self, patterns: List[str]) -> 'SnapshotQuerySet': - """Search snapshots using the configured search backend""" - from archivebox.config.common import SEARCH_BACKEND_CONFIG - from archivebox.search import query_search_index - from archivebox.misc.logging import stderr - - if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND: - stderr() - stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red') - raise SystemExit(2) - - qsearch = self.none() - for pattern in patterns: - try: - qsearch |= query_search_index(pattern) - except: - raise SystemExit(2) - return self.all() & qsearch - - # ========================================================================= - # Export Methods - # ========================================================================= - - def to_json(self, with_headers: bool = False) -> str: - """Generate JSON index from snapshots""" - import sys - from datetime import datetime, timezone as tz - from archivebox.config import VERSION - from archivebox.config.common import SERVER_CONFIG - - MAIN_INDEX_HEADER = { - 'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.', - 'schema': 'archivebox.index.json', - 'copyright_info': SERVER_CONFIG.FOOTER_INFO, - 'meta': { - 'project': 'ArchiveBox', - 'version': VERSION, - 'git_sha': VERSION, - 'website': 'https://ArchiveBox.io', 
- 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', - 'source': 'https://github.com/ArchiveBox/ArchiveBox', - 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', - 'dependencies': {}, - }, - } if with_headers else {} - - snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)] - - if with_headers: - output = { - **MAIN_INDEX_HEADER, - 'num_links': len(snapshot_dicts), - 'updated': datetime.now(tz.utc), - 'last_run_cmd': sys.argv, - 'links': snapshot_dicts, - } - else: - output = snapshot_dicts - return to_json(output, indent=4, sort_keys=True) - - def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str: - """Generate CSV output from snapshots""" - cols = cols or ['timestamp', 'is_archived', 'url'] - header_str = separator.join(col.ljust(ljust) for col in cols) if header else '' - row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500)) - return '\n'.join((header_str, *row_strs)) - - def to_html(self, with_headers: bool = True) -> str: - """Generate main index HTML from snapshots""" - from datetime import datetime, timezone as tz - from django.template.loader import render_to_string - from archivebox.config import VERSION - from archivebox.config.common import SERVER_CONFIG - from archivebox.config.version import get_COMMIT_HASH - - template = 'static_index.html' if with_headers else 'minimal_index.html' - snapshot_list = list(self.iterator(chunk_size=500)) - - return render_to_string(template, { - 'version': VERSION, - 'git_sha': get_COMMIT_HASH() or VERSION, - 'num_links': str(len(snapshot_list)), - 'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'), - 'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'), - 'links': snapshot_list, - 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, - }) - - -class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): - """Manager for Snapshot model - uses SnapshotQuerySet for chainable methods""" - - def filter(self, *args, **kwargs): - domain = kwargs.pop('domain', None) - qs = super().filter(*args, **kwargs) - if domain: - qs = qs.filter(url__icontains=f'://{domain}') - return qs - - def get_queryset(self): - return super().get_queryset().prefetch_related('tags', 'archiveresult_set') - - # ========================================================================= - # Import Methods - # ========================================================================= - - def remove(self, atomic: bool = False) -> tuple: - """Remove snapshots from the database""" - from django.db import transaction - if atomic: - with transaction.atomic(): - return self.delete() - return self.delete() - - -class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - created_at = models.DateTimeField(default=timezone.now, db_index=True) - modified_at = models.DateTimeField(auto_now=True) - - url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls - timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) - bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True) - crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment] - parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, 
blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)') - - title = models.CharField(max_length=512, null=True, blank=True, db_index=True) - downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) - depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs - fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().') - current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.') - - retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) - status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) - config = models.JSONField(default=dict, null=False, blank=False, editable=True) - notes = models.TextField(blank=True, null=False, default='') - output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True) - - tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) - - state_machine_name = 'core.models.SnapshotMachine' - state_field_name = 'status' - retry_at_field_name = 'retry_at' - StatusChoices = ModelWithStateMachine.StatusChoices - active_state = StatusChoices.STARTED - - objects = SnapshotManager() - archiveresult_set: models.Manager['ArchiveResult'] - - class Meta(TypedModelMeta): - verbose_name = "Snapshot" - verbose_name_plural = "Snapshots" - constraints = [ - # Allow same URL in different crawls, but not duplicates within same crawl - models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), - # Global timestamp uniqueness for 1:1 symlink mapping - models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'), - ] - - def __str__(self): - return f'[{self.id}] {self.url[:64]}' - - def save(self, *args, **kwargs): - is_new = self._state.adding - if not self.bookmarked_at: - self.bookmarked_at = self.created_at or timezone.now() - if not self.timestamp: - self.timestamp = str(self.bookmarked_at.timestamp()) - - # Migrate filesystem if needed (happens automatically on save) - if self.pk and self.fs_migration_needed: - from django.db import transaction - with transaction.atomic(): - # Walk through migration chain automatically - current = self.fs_version - target = self._fs_current_version() - - while current != target: - next_ver = self._fs_next_version(current) - method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}' - - # Only run if method exists (most are no-ops) - if hasattr(self, method): - getattr(self, method)() - - current = next_ver - - # Update version (still in transaction) - self.fs_version = target - - super().save(*args, **kwargs) - if self.crawl and self.url not in self.crawl.urls: - self.crawl.urls += f'\n{self.url}' - self.crawl.save() - - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created Snapshot', - indent_level=2, - url=self.url, - metadata={ - 'id': str(self.id), - 'crawl_id': str(self.crawl_id) if self.crawl_id else None, - 'depth': self.depth, - 'status': self.status, - 
}, - ) - - # ========================================================================= - # Filesystem Migration Methods - # ========================================================================= - - @staticmethod - def _fs_current_version() -> str: - """Get current ArchiveBox filesystem version (normalized to x.x.0 format)""" - from archivebox.config import VERSION - # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0") - parts = VERSION.split('.') - if len(parts) >= 2: - major, minor = parts[0], parts[1] - # Strip any non-numeric suffix from minor version - minor = ''.join(c for c in minor if c.isdigit()) - return f'{major}.{minor}.0' - return '0.9.0' # Fallback if version parsing fails - - @property - def fs_migration_needed(self) -> bool: - """Check if snapshot needs filesystem migration""" - return self.fs_version != self._fs_current_version() - - def _fs_next_version(self, version: str) -> str: - """Get next version in migration chain""" - chain = ['0.7.0', '0.8.0', '0.9.0'] - try: - idx = chain.index(version) - return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version() - except ValueError: - # Unknown version - skip to current - return self._fs_current_version() - - def _fs_migrate_from_0_7_0_to_0_8_0(self): - """Migration from 0.7.0 to 0.8.0 layout (no-op)""" - # 0.7 and 0.8 both used archive/ - # Nothing to do! - pass - - def _fs_migrate_from_0_8_0_to_0_9_0(self): - """ - Migrate from flat to nested structure. - - 0.8.x: archive/{timestamp}/ - 0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/ - - Transaction handling: - 1. Copy files INSIDE transaction - 2. Create symlink INSIDE transaction - 3. Update fs_version INSIDE transaction (done by save()) - 4. Exit transaction (DB commit) - 5. Delete old files OUTSIDE transaction (after commit) - """ - import shutil - from django.db import transaction - - old_dir = self.get_storage_path_for_version('0.8.0') - new_dir = self.get_storage_path_for_version('0.9.0') - - if not old_dir.exists() or old_dir == new_dir or new_dir.exists(): - return - - new_dir.mkdir(parents=True, exist_ok=True) - - # Copy all files (idempotent) - for old_file in old_dir.rglob('*'): - if not old_file.is_file(): - continue - - rel_path = old_file.relative_to(old_dir) - new_file = new_dir / rel_path - - # Skip if already copied - if new_file.exists() and new_file.stat().st_size == old_file.stat().st_size: - continue - - new_file.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(old_file, new_file) - - # Verify all copied - old_files = {f.relative_to(old_dir): f.stat().st_size - for f in old_dir.rglob('*') if f.is_file()} - new_files = {f.relative_to(new_dir): f.stat().st_size - for f in new_dir.rglob('*') if f.is_file()} - - if old_files.keys() != new_files.keys(): - missing = old_files.keys() - new_files.keys() - raise Exception(f"Migration incomplete: missing {missing}") - - # Create backwards-compat symlink (INSIDE transaction) - symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp - if symlink_path.is_symlink(): - symlink_path.unlink() - - if not symlink_path.exists() or symlink_path == old_dir: - symlink_path.symlink_to(new_dir, target_is_directory=True) - - # Schedule old directory deletion AFTER transaction commits - transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir)) - - def _cleanup_old_migration_dir(self, old_dir: Path): - """ - Delete old directory after successful migration. - Called via transaction.on_commit() after DB commit succeeds. 
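        A minimal ordering sketch of steps 1-5 above (names from this file; simplified,
        not runnable standalone):

            with transaction.atomic():
                self._fs_migrate_from_0_8_0_to_0_9_0()        # 1-2: copy files, create symlink
                self.fs_version = self._fs_current_version()  # 3: bump version
            # 4: transaction commits
            # 5: Django then runs the on_commit callback registered above:
            #        self._cleanup_old_migration_dir(old_dir)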
- """ - import shutil - import logging - - if old_dir.exists() and not old_dir.is_symlink(): - try: - shutil.rmtree(old_dir) - except Exception as e: - # Log but don't raise - migration succeeded, this is just cleanup - logging.getLogger('archivebox.migration').warning( - f"Could not remove old migration directory {old_dir}: {e}" - ) - - # ========================================================================= - # Path Calculation and Migration Helpers - # ========================================================================= - - @staticmethod - def extract_domain_from_url(url: str) -> str: - """ - Extract domain from URL for 0.9.x path structure. - Uses full hostname with sanitized special chars. - - Examples: - https://example.com:8080 → example.com_8080 - https://sub.example.com → sub.example.com - file:///path → localhost - data:text/html → data - """ - from urllib.parse import urlparse - - try: - parsed = urlparse(url) - - if parsed.scheme in ('http', 'https'): - if parsed.port: - return f"{parsed.hostname}_{parsed.port}".replace(':', '_') - return parsed.hostname or 'unknown' - elif parsed.scheme == 'file': - return 'localhost' - elif parsed.scheme: - return parsed.scheme - else: - return 'unknown' - except Exception: - return 'unknown' - - def get_storage_path_for_version(self, version: str) -> Path: - """ - Calculate storage path for specific filesystem version. - Centralizes path logic so it's reusable. - - 0.7.x/0.8.x: archive/{timestamp} - 0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/ - """ - from datetime import datetime - - if version in ('0.7.0', '0.8.0'): - return CONSTANTS.ARCHIVE_DIR / self.timestamp - - elif version in ('0.9.0', '1.0.0'): - username = self.crawl.created_by.username - - # Use created_at for date grouping (fallback to timestamp) - if self.created_at: - date_str = self.created_at.strftime('%Y%m%d') - else: - date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d') - - domain = self.extract_domain_from_url(self.url) - - return ( - CONSTANTS.DATA_DIR / 'users' / username / 'snapshots' / - date_str / domain / str(self.id) - ) - else: - # Unknown version - use current - return self.get_storage_path_for_version(self._fs_current_version()) - - # ========================================================================= - # Loading and Creation from Filesystem (Used by archivebox update ONLY) - # ========================================================================= - - @classmethod - def load_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: - """ - Load existing Snapshot from DB by reading index.json. - - Reads index.json, extracts url+timestamp, queries DB. - Returns existing Snapshot or None if not found/invalid. - Does NOT create new snapshots. 
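        Example (hypothetical directory name; assumes a configured ArchiveBox data dir):

            snap = Snapshot.load_from_directory(Path('archive/1611234567'))
            if snap is None:
                # missing/unparsable index.json, no 'url', invalid timestamp,
                # or no matching Snapshot row in the DB
                ...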
- - ONLY used by: archivebox update (for orphan detection) - """ - import json - - index_path = snapshot_dir / 'index.json' - if not index_path.exists(): - return None - - try: - with open(index_path) as f: - data = json.load(f) - except: - return None - - url = data.get('url') - if not url: - return None - - # Get timestamp - prefer index.json, fallback to folder name - timestamp = cls._select_best_timestamp( - index_timestamp=data.get('timestamp'), - folder_name=snapshot_dir.name - ) - - if not timestamp: - return None - - # Look up existing - try: - return cls.objects.get(url=url, timestamp=timestamp) - except cls.DoesNotExist: - return None - except cls.MultipleObjectsReturned: - # Should not happen with unique constraint - return cls.objects.filter(url=url, timestamp=timestamp).first() - - @classmethod - def create_from_directory(cls, snapshot_dir: Path) -> Optional['Snapshot']: - """ - Create new Snapshot from orphaned directory. - - Validates timestamp, ensures uniqueness. - Returns new UNSAVED Snapshot or None if invalid. - - ONLY used by: archivebox update (for orphan import) - """ - import json - - index_path = snapshot_dir / 'index.json' - if not index_path.exists(): - return None - - try: - with open(index_path) as f: - data = json.load(f) - except: - return None - - url = data.get('url') - if not url: - return None - - # Get and validate timestamp - timestamp = cls._select_best_timestamp( - index_timestamp=data.get('timestamp'), - folder_name=snapshot_dir.name - ) - - if not timestamp: - return None - - # Ensure uniqueness (reuses existing logic from create_or_update_from_dict) - timestamp = cls._ensure_unique_timestamp(url, timestamp) - - # Detect version - fs_version = cls._detect_fs_version_from_index(data) - - return cls( - url=url, - timestamp=timestamp, - title=data.get('title', ''), - fs_version=fs_version, - created_by_id=get_or_create_system_user_pk(), - ) - - @staticmethod - def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]: - """ - Select best timestamp from index.json vs folder name. - - Validates range (1995-2035). - Prefers index.json if valid. - """ - def is_valid_timestamp(ts): - try: - ts_int = int(float(ts)) - # 1995-01-01 to 2035-12-31 - return 788918400 <= ts_int <= 2082758400 - except: - return False - - index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False - folder_valid = is_valid_timestamp(folder_name) - - if index_valid: - return str(int(float(index_timestamp))) - elif folder_valid: - return str(int(float(folder_name))) - else: - return None - - @classmethod - def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str: - """ - Ensure timestamp is globally unique. - If collision with different URL, increment by 1 until unique. - - NOTE: Logic already exists in create_or_update_from_dict (line 266-267) - This is just an extracted, reusable version. - """ - while cls.objects.filter(timestamp=timestamp).exclude(url=url).exists(): - timestamp = str(int(float(timestamp)) + 1) - return timestamp - - @staticmethod - def _detect_fs_version_from_index(data: dict) -> str: - """ - Detect fs_version from index.json structure. 
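        For example (illustrative records):

            {'history': {'wget': [...]}}                    -> '0.7.0'
            {'archive_results': [...]}                      -> '0.8.0'
            {'fs_version': '0.9.0', 'archive_results': []}  -> '0.9.0'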
- - - Has fs_version field: use it - - Has history dict: 0.7.0 - - Has archive_results list: 0.8.0 - - Default: 0.7.0 - """ - if 'fs_version' in data: - return data['fs_version'] - if 'history' in data and 'archive_results' not in data: - return '0.7.0' - if 'archive_results' in data: - return '0.8.0' - return '0.7.0' - - # ========================================================================= - # Index.json Reconciliation - # ========================================================================= - - def reconcile_with_index_json(self): - """ - Merge index.json with DB. DB is source of truth. - - - Title: longest non-URL - - Tags: union - - ArchiveResults: keep both (by plugin+start_ts) - - Writes back in 0.9.x format. - - Used by: archivebox update (to sync index.json with DB) - """ - import json - - index_path = Path(self.output_dir) / 'index.json' - - index_data = {} - if index_path.exists(): - try: - with open(index_path) as f: - index_data = json.load(f) - except: - pass - - # Merge title - self._merge_title_from_index(index_data) - - # Merge tags - self._merge_tags_from_index(index_data) - - # Merge ArchiveResults - self._merge_archive_results_from_index(index_data) - - # Write back - self.write_index_json() - - def _merge_title_from_index(self, index_data: dict): - """Merge title - prefer longest non-URL title.""" - index_title = index_data.get('title', '').strip() - db_title = self.title or '' - - candidates = [t for t in [index_title, db_title] if t and t != self.url] - if candidates: - best_title = max(candidates, key=len) - if self.title != best_title: - self.title = best_title - - def _merge_tags_from_index(self, index_data: dict): - """Merge tags - union of both sources.""" - from django.db import transaction - - index_tags = set(index_data.get('tags', '').split(',')) if index_data.get('tags') else set() - index_tags = {t.strip() for t in index_tags if t.strip()} - - db_tags = set(self.tags.values_list('name', flat=True)) - - new_tags = index_tags - db_tags - if new_tags: - with transaction.atomic(): - for tag_name in new_tags: - tag, _ = Tag.objects.get_or_create(name=tag_name) - self.tags.add(tag) - - def _merge_archive_results_from_index(self, index_data: dict): - """Merge ArchiveResults - keep both (by plugin+start_ts).""" - existing = { - (ar.plugin, ar.start_ts): ar - for ar in ArchiveResult.objects.filter(snapshot=self) - } - - # Handle 0.8.x format (archive_results list) - for result_data in index_data.get('archive_results', []): - self._create_archive_result_if_missing(result_data, existing) - - # Handle 0.7.x format (history dict) - if 'history' in index_data and isinstance(index_data['history'], dict): - for plugin, result_list in index_data['history'].items(): - if isinstance(result_list, list): - for result_data in result_list: - # Support both old 'extractor' and new 'plugin' keys for backwards compat - result_data['plugin'] = result_data.get('plugin') or result_data.get('extractor') or plugin - self._create_archive_result_if_missing(result_data, existing) - - def _create_archive_result_if_missing(self, result_data: dict, existing: dict): - """Create ArchiveResult if not already in DB.""" - from dateutil import parser - - # Support both old 'extractor' and new 'plugin' keys for backwards compat - plugin = result_data.get('plugin') or result_data.get('extractor', '') - if not plugin: - return - - start_ts = None - if result_data.get('start_ts'): - try: - start_ts = parser.parse(result_data['start_ts']) - except: - pass - - if (plugin, start_ts) in existing: 
- return - - try: - end_ts = None - if result_data.get('end_ts'): - try: - end_ts = parser.parse(result_data['end_ts']) - except: - pass - - ArchiveResult.objects.create( - snapshot=self, - plugin=plugin, - hook_name=result_data.get('hook_name', ''), - status=result_data.get('status', 'failed'), - output_str=result_data.get('output', ''), - cmd=result_data.get('cmd', []), - pwd=result_data.get('pwd', str(self.output_dir)), - start_ts=start_ts, - end_ts=end_ts, - created_by=self.crawl.created_by, - ) - except: - pass - - def write_index_json(self): - """Write index.json in 0.9.x format.""" - import json - - index_path = Path(self.output_dir) / 'index.json' - - data = { - 'url': self.url, - 'timestamp': self.timestamp, - 'title': self.title or '', - 'tags': ','.join(sorted(self.tags.values_list('name', flat=True))), - 'fs_version': self.fs_version, - 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, - 'created_at': self.created_at.isoformat() if self.created_at else None, - 'archive_results': [ - { - 'plugin': ar.plugin, - 'status': ar.status, - 'start_ts': ar.start_ts.isoformat() if ar.start_ts else None, - 'end_ts': ar.end_ts.isoformat() if ar.end_ts else None, - 'output': ar.output_str or '', - 'cmd': ar.cmd if isinstance(ar.cmd, list) else [], - 'pwd': ar.pwd, - } - for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts') - ], - } - - index_path.parent.mkdir(parents=True, exist_ok=True) - with open(index_path, 'w') as f: - json.dump(data, f, indent=2, sort_keys=True) - - # ========================================================================= - # Snapshot Utilities - # ========================================================================= - - @staticmethod - def move_directory_to_invalid(snapshot_dir: Path): - """ - Move invalid directory to data/invalid/YYYYMMDD/. - - Used by: archivebox update (when encountering invalid directories) - """ - from datetime import datetime - import shutil - - invalid_dir = CONSTANTS.DATA_DIR / 'invalid' / datetime.now().strftime('%Y%m%d') - invalid_dir.mkdir(parents=True, exist_ok=True) - - dest = invalid_dir / snapshot_dir.name - counter = 1 - while dest.exists(): - dest = invalid_dir / f"{snapshot_dir.name}_{counter}" - counter += 1 - - try: - shutil.move(str(snapshot_dir), str(dest)) - except: - pass - - @classmethod - def find_and_merge_duplicates(cls) -> int: - """ - Find and merge snapshots with same url:timestamp. - Returns count of duplicate sets merged. - - Used by: archivebox update (Phase 3: deduplication) - """ - from django.db.models import Count - - duplicates = ( - cls.objects - .values('url', 'timestamp') - .annotate(count=Count('id')) - .filter(count__gt=1) - ) - - merged = 0 - for dup in duplicates.iterator(): - snapshots = list( - cls.objects - .filter(url=dup['url'], timestamp=dup['timestamp']) - .order_by('created_at') # Keep oldest - ) - - if len(snapshots) > 1: - try: - cls._merge_snapshots(snapshots) - merged += 1 - except: - pass - - return merged - - @classmethod - def _merge_snapshots(cls, snapshots: list['Snapshot']): - """ - Merge exact duplicates. - Keep oldest, union files + ArchiveResults. 
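        Sketch of the merge policy (hypothetical duplicates, oldest first):

            keeper, dup = snapshots[0], snapshots[1]
            # files:   dup's files are copied into keeper's dir only if missing there
            # tags:    dup's tags are added to keeper's tags
            # results: ArchiveResult rows are re-pointed from dup to keeper
            # finally: dup's directory is removed and dup.delete() is called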
- """ - import shutil - - keeper = snapshots[0] - duplicates = snapshots[1:] - - keeper_dir = Path(keeper.output_dir) - - for dup in duplicates: - dup_dir = Path(dup.output_dir) - - # Merge files - if dup_dir.exists() and dup_dir != keeper_dir: - for dup_file in dup_dir.rglob('*'): - if not dup_file.is_file(): - continue - - rel = dup_file.relative_to(dup_dir) - keeper_file = keeper_dir / rel - - if not keeper_file.exists(): - keeper_file.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(dup_file, keeper_file) - - try: - shutil.rmtree(dup_dir) - except: - pass - - # Merge tags - for tag in dup.tags.all(): - keeper.tags.add(tag) - - # Move ArchiveResults - ArchiveResult.objects.filter(snapshot=dup).update(snapshot=keeper) - - # Delete - dup.delete() - - # ========================================================================= - # Output Directory Properties - # ========================================================================= - - @property - def output_dir_parent(self) -> str: - return 'archive' - - @property - def output_dir_name(self) -> str: - return str(self.timestamp) - - def archive(self, overwrite=False, methods=None): - return bg_archive_snapshot(self, overwrite=overwrite, methods=methods) - - @admin.display(description='Tags') - def tags_str(self, nocache=True) -> str | None: - calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all())) - if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache: - return calc_tags_str() - cache_key = f'{self.pk}-tags' - return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str() - - def icons(self) -> str: - """Generate HTML icons showing which extractor plugins have succeeded for this snapshot""" - from django.utils.html import format_html, mark_safe - - cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}' - - def calc_icons(): - if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: - archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)} - else: - # Filter for results that have either output_files or output_str - from django.db.models import Q - archive_results = {r.plugin: r for r in self.archiveresult_set.filter( - Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str='')) - )} - - path = self.archive_path - canon = self.canonical_outputs() - output = "" - output_template = '{}  ' - - # Get all plugins from hooks system (sorted by numeric prefix) - all_plugins = [get_plugin_name(e) for e in get_plugins()] - - for plugin in all_plugins: - result = archive_results.get(plugin) - existing = result and result.status == 'succeeded' and (result.output_files or result.output_str) - icon = get_plugin_icon(plugin) - output += format_html( - output_template, - path, - canon.get(plugin, plugin + '/'), - str(bool(existing)), - plugin, - icon - ) - - return format_html('{}', mark_safe(output)) - - cache_result = cache.get(cache_key) - if cache_result: - return cache_result - - fresh_result = calc_icons() - cache.set(cache_key, fresh_result, timeout=60 * 60 * 24) - return fresh_result - - @property - def api_url(self) -> str: - return reverse_lazy('api-1:get_snapshot', args=[self.id]) - - def get_absolute_url(self): - return f'/{self.archive_path}' - - @cached_property - def domain(self) -> str: - return url_domain(self.url) - - 
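    # (illustrative) how these helpers relate for a hypothetical snapshot -- exact values
    # depend on url_domain() and CONSTANTS.ARCHIVE_DIR_NAME:
    #   url                = 'https://example.com/blog/post.html'
    #   domain             -> url_domain(url)                    e.g. 'example.com'
    #   archive_path       -> f'{ARCHIVE_DIR_NAME}/{timestamp}'  e.g. 'archive/1611234567.0'
    #   get_absolute_url() -> f'/{archive_path}'                 e.g. '/archive/1611234567.0'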
@cached_property - def output_dir(self): - """The filesystem path to the snapshot's output directory.""" - import os - - current_path = self.get_storage_path_for_version(self.fs_version) - - if current_path.exists(): - return str(current_path) - - # Check for backwards-compat symlink - old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp - if old_path.is_symlink(): - return str(Path(os.readlink(old_path)).resolve()) - elif old_path.exists(): - return str(old_path) - - return str(current_path) - - @cached_property - def archive_path(self): - return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' - - @cached_property - def archive_size(self): - try: - return get_dir_size(self.output_dir)[0] - except Exception: - return 0 - - def save_tags(self, tags: Iterable[str] = ()) -> None: - tags_id = [Tag.objects.get_or_create(name=tag)[0].pk for tag in tags if tag.strip()] - self.tags.clear() - self.tags.add(*tags_id) - - def pending_archiveresults(self) -> QuerySet['ArchiveResult']: - return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES) - - def run(self) -> list['ArchiveResult']: - """ - Execute snapshot by creating pending ArchiveResults for all enabled hooks. - - Called by: SnapshotMachine.enter_started() - - Hook Lifecycle: - 1. discover_hooks('Snapshot') → finds all plugin hooks - 2. For each hook: - - Create ArchiveResult with status=QUEUED - - Store hook_name (e.g., 'on_Snapshot__50_wget.py') - 3. ArchiveResults execute independently via ArchiveResultMachine - 4. Hook execution happens in ArchiveResult.run(), NOT here - - Returns: - list[ArchiveResult]: Newly created pending results - """ - return self.create_pending_archiveresults() - - def cleanup(self): - """ - Clean up background ArchiveResult hooks. - - Called by the state machine when entering the 'sealed' state. - Kills any background hooks and finalizes their ArchiveResults. - """ - from archivebox.hooks import kill_process - - # Kill any background ArchiveResult hooks - if not self.OUTPUT_DIR.exists(): - return - - # Find all .pid files in this snapshot's output directory - for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): - kill_process(pid_file, validate=True) - - # Update all STARTED ArchiveResults from filesystem - results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED) - for ar in results: - ar.update_from_output() - - def has_running_background_hooks(self) -> bool: - """ - Check if any ArchiveResult background hooks are still running. - - Used by state machine to determine if snapshot is finished. - """ - from archivebox.hooks import process_is_alive - - if not self.OUTPUT_DIR.exists(): - return False - - for plugin_dir in self.OUTPUT_DIR.iterdir(): - if not plugin_dir.is_dir(): - continue - pid_file = plugin_dir / 'hook.pid' - if process_is_alive(pid_file): - return True - - return False - - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): - """ - Create/update Snapshot from JSONL record or dict. 
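        Illustrative call (assumes a configured ArchiveBox/Django environment; `crawl`
        is a hypothetical existing Crawl and may be omitted):

            snap = Snapshot.from_jsonl(
                {"url": "https://example.com/page", "tags": "news,tech"},
                overrides={"crawl": crawl},        # optional; a Crawl is auto-created if omitted
                queue_for_extraction=True,         # sets status=QUEUED and retry_at=now
            )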
- - Unified method that handles: - - ID-based patching: {"id": "...", "title": "new title"} - - URL-based create/update: {"url": "...", "title": "...", "tags": "..."} - - Auto-creates Crawl if not provided - - Optionally queues for extraction - - Args: - record: Dict with 'url' (for create) or 'id' (for patch), plus other fields - overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id' - queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True) - - Returns: - Snapshot instance or None - """ - import re - from django.utils import timezone - from archivebox.misc.util import parse_date - from archivebox.base_models.models import get_or_create_system_user_pk - from archivebox.config.common import GENERAL_CONFIG - - overrides = overrides or {} - - # If 'id' is provided, lookup and patch that specific snapshot - snapshot_id = record.get('id') - if snapshot_id: - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - - # Generically update all fields present in record - update_fields = [] - for field_name, value in record.items(): - # Skip internal fields - if field_name in ('id', 'type'): - continue - - # Skip if field doesn't exist on model - if not hasattr(snapshot, field_name): - continue - - # Special parsing for date fields - if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'): - if value and isinstance(value, str): - value = parse_date(value) - - # Update field if value is provided and different - if value is not None and getattr(snapshot, field_name) != value: - setattr(snapshot, field_name, value) - update_fields.append(field_name) - - if update_fields: - snapshot.save(update_fields=update_fields + ['modified_at']) - - return snapshot - except Snapshot.DoesNotExist: - # ID not found, fall through to create-by-URL logic - pass - - url = record.get('url') - if not url: - return None - - # Determine or create crawl (every snapshot must have a crawl) - crawl = overrides.get('crawl') - parent_snapshot = overrides.get('snapshot') # Parent snapshot - created_by_id = overrides.get('created_by_id') or (parent_snapshot.crawl.created_by_id if parent_snapshot else None) or get_or_create_system_user_pk() - - # If no crawl provided, inherit from parent or auto-create one - if not crawl: - if parent_snapshot: - # Inherit crawl from parent snapshot - crawl = parent_snapshot.crawl - else: - # Auto-create a single-URL crawl - from archivebox.crawls.models import Crawl - from archivebox.config import CONSTANTS - - timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") - sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt' - sources_file.parent.mkdir(parents=True, exist_ok=True) - sources_file.write_text(url) - - crawl = Crawl.objects.create( - urls=url, - max_depth=0, - label=f'auto-created for {url[:50]}', - created_by_id=created_by_id, - ) - - # Parse tags - tags_str = record.get('tags', '') - tag_list = [] - if tags_str: - tag_list = list(dict.fromkeys( - tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str) - if tag.strip() - )) - - # Get most recent snapshot with this URL (URLs can exist in multiple crawls) - snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first() - - title = record.get('title') - timestamp = record.get('timestamp') - - if snapshot: - # Update existing snapshot - if title and (not snapshot.title or len(title) > len(snapshot.title or '')): - snapshot.title = title - snapshot.save(update_fields=['title', 'modified_at']) - else: - # Create new snapshot - if 
timestamp: - while Snapshot.objects.filter(timestamp=timestamp).exists(): - timestamp = str(float(timestamp) + 1.0) - - snapshot = Snapshot.objects.create( - url=url, - timestamp=timestamp, - title=title, - crawl=crawl, - ) - - # Update tags - if tag_list: - existing_tags = set(snapshot.tags.values_list('name', flat=True)) - new_tags = set(tag_list) | existing_tags - snapshot.save_tags(new_tags) - - # Queue for extraction and update additional fields - update_fields = [] - - if queue_for_extraction: - snapshot.status = Snapshot.StatusChoices.QUEUED - snapshot.retry_at = timezone.now() - update_fields.extend(['status', 'retry_at']) - - # Update additional fields if provided - for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'): - value = record.get(field_name) - if value is not None and getattr(snapshot, field_name) != value: - setattr(snapshot, field_name, value) - update_fields.append(field_name) - - if update_fields: - snapshot.save(update_fields=update_fields + ['modified_at']) - - return snapshot - - def create_pending_archiveresults(self) -> list['ArchiveResult']: - """ - Create ArchiveResult records for all enabled hooks. - - Uses the hooks system to discover available hooks from: - - archivebox/plugins/*/on_Snapshot__*.{py,sh,js} - - data/plugins/*/on_Snapshot__*.{py,sh,js} - - Creates one ArchiveResult per hook (not per plugin), with hook_name set. - This enables step-based execution where all hooks in a step can run in parallel. - """ - from archivebox.hooks import discover_hooks - - hooks = discover_hooks('Snapshot') - archiveresults = [] - - for hook_path in hooks: - hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py' - plugin = hook_path.parent.name # e.g., 'wget' - - # Check if AR already exists for this specific hook - if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists(): - continue - - archiveresult, created = ArchiveResult.objects.get_or_create( - snapshot=self, - hook_name=hook_name, - defaults={ - 'plugin': plugin, - 'status': ArchiveResult.INITIAL_STATE, - 'retry_at': timezone.now(), - 'created_by_id': self.crawl.created_by_id, - }, - ) - if archiveresult.status == ArchiveResult.INITIAL_STATE: - archiveresults.append(archiveresult) - - return archiveresults - - def advance_step_if_ready(self) -> bool: - """ - Advance current_step if all foreground hooks in current step are finished. - - Called by the state machine to check if step can advance. - Background hooks (.bg) don't block step advancement. - - Step advancement rules: - - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED) - - Background ARs (hook_name contains '.bg.') are ignored for advancement - - When ready, increments current_step by 1 (up to 9) - - Returns: - True if step was advanced, False if not ready or already at step 9. 
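        Worked example (hook names hypothetical; assumes extract_step() maps the numeric
        prefix to its tens digit, e.g. '50' -> step 5):

            # current_step == 2, ArchiveResults for this snapshot:
            #   on_Snapshot__20_title.py      SUCCEEDED  (step 2, foreground)
            #   on_Snapshot__21_favicon.py    STARTED    (step 2, foreground)  <- blocks
            #   on_Snapshot__25_media.bg.py   STARTED    (step 2, background)  <- ignored
            # -> advance_step_if_ready() returns False until 21_favicon finishes,
            #    after which current_step becomes 3.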
- """ - from archivebox.hooks import extract_step, is_background_hook - - if self.current_step >= 9: - return False # Already at final step - - # Get all ARs for current step that are foreground - current_step_ars = self.archiveresult_set.filter( - hook_name__isnull=False - ).exclude(hook_name='') - - # Check each AR in current step - for ar in current_step_ars: - ar_step = extract_step(ar.hook_name) - if ar_step != self.current_step: - continue # Not in current step - - if is_background_hook(ar.hook_name): - continue # Background hooks don't block - - # Foreground hook in current step - check if finished - if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES: - # Still pending/queued - can't advance - return False - - if ar.status == ArchiveResult.StatusChoices.STARTED: - # Still running - can't advance - return False - - # All foreground hooks in current step are finished - advance! - self.current_step += 1 - self.save(update_fields=['current_step', 'modified_at']) - return True - - def is_finished_processing(self) -> bool: - """ - Check if this snapshot has finished processing. - - Used by SnapshotMachine.is_finished() to determine if snapshot is complete. - - Returns: - True if all archiveresults are finished (or no work to do), False otherwise. - """ - # if no archiveresults exist yet, it's not finished - if not self.archiveresult_set.exists(): - return False - - # Try to advance step if ready (handles step-based hook execution) - # This will increment current_step when all foreground hooks in current step are done - while self.advance_step_if_ready(): - pass # Keep advancing until we can't anymore - - # if archiveresults exist but are still pending, it's not finished - if self.pending_archiveresults().exists(): - return False - - # Don't wait for background hooks - they'll be cleaned up on entering sealed state - # Background hooks in STARTED state are excluded by pending_archiveresults() - # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE, - # we can transition to sealed and cleanup() will kill the background hooks - - # otherwise archiveresults exist and are all finished, so it's finished - return True - - def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int: - """ - Reset failed/skipped ArchiveResults to queued for retry. - - This enables seamless retry of the entire extraction pipeline: - - Resets FAILED and SKIPPED results to QUEUED - - Sets retry_at so workers pick them up - - Plugins run in order (numeric prefix) - - Each plugin checks its dependencies at runtime - - Dependency handling (e.g., chrome_session → screenshot): - - Plugins check if required outputs exist before running - - If dependency output missing → plugin returns 'skipped' - - On retry, if dependency now succeeds → dependent can run - - Returns count of ArchiveResults reset. 
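        Illustrative usage (assumes a saved Snapshot with some failed results):

            n = snapshot.retry_failed_archiveresults()
            # n FAILED/SKIPPED results are back to QUEUED with retry_at=now;
            # the snapshot itself is reset to STARTED at current_step=0, so workers
            # re-walk the hook steps and dependent plugins get another chance.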
- """ - retry_at = retry_at or timezone.now() - - count = self.archiveresult_set.filter( - status__in=[ - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ] - ).update( - status=ArchiveResult.StatusChoices.QUEUED, - retry_at=retry_at, - output=None, - start_ts=None, - end_ts=None, - ) - - # Also reset the snapshot and current_step so it gets re-checked from the beginning - if count > 0: - self.status = self.StatusChoices.STARTED - self.retry_at = retry_at - self.current_step = 0 # Reset to step 0 for retry - self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at']) - - return count - - # ========================================================================= - # URL Helper Properties (migrated from Link schema) - # ========================================================================= - - @cached_property - def url_hash(self) -> str: - from hashlib import sha256 - return sha256(self.url.encode()).hexdigest()[:8] - - @cached_property - def scheme(self) -> str: - return self.url.split('://')[0] - - @cached_property - def path(self) -> str: - parts = self.url.split('://', 1) - return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/' - - @cached_property - def basename(self) -> str: - return self.path.split('/')[-1] - - @cached_property - def extension(self) -> str: - basename = self.basename - return basename.split('.')[-1] if '.' in basename else '' - - @cached_property - def base_url(self) -> str: - return f'{self.scheme}://{self.domain}' - - @cached_property - def is_static(self) -> bool: - static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'} - return any(self.url.lower().endswith(ext) for ext in static_extensions) - - @cached_property - def is_archived(self) -> bool: - output_paths = ( - self.domain, - 'output.html', - 'output.pdf', - 'screenshot.png', - 'singlefile.html', - 'readability/content.html', - 'mercury/content.html', - 'htmltotext.txt', - 'media', - 'git', - ) - return any((Path(self.output_dir) / path).exists() for path in output_paths) - - # ========================================================================= - # Date/Time Properties (migrated from Link schema) - # ========================================================================= - - @cached_property - def bookmarked_date(self) -> Optional[str]: - max_ts = (timezone.now() + timedelta(days=30)).timestamp() - if self.timestamp and self.timestamp.replace('.', '').isdigit(): - if 0 < float(self.timestamp) < max_ts: - return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp))) - return str(self.timestamp) - return None - - @cached_property - def downloaded_datestr(self) -> Optional[str]: - return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None - - @cached_property - def archive_dates(self) -> List[datetime]: - return [ - result.start_ts - for result in self.archiveresult_set.all() - if result.start_ts - ] - - @cached_property - def oldest_archive_date(self) -> Optional[datetime]: - dates = self.archive_dates - return min(dates) if dates else None - - @cached_property - def newest_archive_date(self) -> Optional[datetime]: - dates = self.archive_dates - return max(dates) if dates else None - - @cached_property - def num_outputs(self) -> int: - return self.archiveresult_set.filter(status='succeeded').count() - - @cached_property - def num_failures(self) -> int: - return self.archiveresult_set.filter(status='failed').count() - - # 
========================================================================= - # Output Path Methods (migrated from Link schema) - # ========================================================================= - - def canonical_outputs(self) -> Dict[str, Optional[str]]: - """ - Intelligently discover the best output file for each plugin. - Uses actual ArchiveResult data and filesystem scanning with smart heuristics. - """ - FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}' - - # Mimetypes that can be embedded/previewed in an iframe - IFRAME_EMBEDDABLE_EXTENSIONS = { - 'html', 'htm', 'pdf', 'txt', 'md', 'json', 'jsonl', - 'png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', - 'mp4', 'webm', 'mp3', 'opus', 'ogg', 'wav', - } - - MIN_DISPLAY_SIZE = 15_000 # 15KB - filter out tiny files - MAX_SCAN_FILES = 50 # Don't scan massive directories - - def find_best_output_in_dir(dir_path: Path, plugin_name: str) -> Optional[str]: - """Find the best representative file in a plugin's output directory""" - if not dir_path.exists() or not dir_path.is_dir(): - return None - - candidates = [] - file_count = 0 - - # Special handling for media plugin - look for thumbnails - is_media_dir = plugin_name == 'media' - - # Scan for suitable files - for file_path in dir_path.rglob('*'): - file_count += 1 - if file_count > MAX_SCAN_FILES: - break - - if file_path.is_dir() or file_path.name.startswith('.'): - continue - - ext = file_path.suffix.lstrip('.').lower() - if ext not in IFRAME_EMBEDDABLE_EXTENSIONS: - continue - - try: - size = file_path.stat().st_size - except OSError: - continue - - # For media dir, allow smaller image files (thumbnails are often < 15KB) - min_size = 5_000 if (is_media_dir and ext in ('png', 'jpg', 'jpeg', 'webp', 'gif')) else MIN_DISPLAY_SIZE - if size < min_size: - continue - - # Prefer main files: index.html, output.*, content.*, etc. 
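                # (illustrative file names) ranking outcome with these weights: in a 'media'
                # dir a 'thumbnail.jpg' (200) outranks 'episode.mp4' (100); elsewhere
                # 'index.html' (100) outranks 'page.pdf' (30); ties fall back to the larger
                # file, since candidates are sorted by (priority, size) descending below.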
- priority = 0 - name_lower = file_path.name.lower() - - if is_media_dir: - # Special prioritization for media directories - if any(keyword in name_lower for keyword in ('thumb', 'thumbnail', 'cover', 'poster')): - priority = 200 # Highest priority for thumbnails - elif ext in ('png', 'jpg', 'jpeg', 'webp', 'gif'): - priority = 150 # High priority for any image - elif ext in ('mp4', 'webm', 'mp3', 'opus', 'ogg'): - priority = 100 # Lower priority for actual media files - else: - priority = 50 - elif 'index' in name_lower: - priority = 100 - elif name_lower.startswith(('output', 'content', plugin_name)): - priority = 50 - elif ext in ('html', 'htm', 'pdf'): - priority = 30 - elif ext in ('png', 'jpg', 'jpeg', 'webp'): - priority = 20 - else: - priority = 10 - - candidates.append((priority, size, file_path)) - - if not candidates: - return None - - # Sort by priority (desc), then size (desc) - candidates.sort(key=lambda x: (x[0], x[1]), reverse=True) - best_file = candidates[0][2] - return str(best_file.relative_to(Path(self.output_dir))) - - canonical = { - 'index_path': 'index.html', - 'google_favicon_path': FAVICON_PROVIDER.format(self.domain), - 'archive_org_path': f'https://web.archive.org/web/{self.base_url}', - } - - # Scan each ArchiveResult's output directory for the best file - snap_dir = Path(self.output_dir) - for result in self.archiveresult_set.filter(status='succeeded'): - if not result.output_files and not result.output_str: - continue - - # Try to find the best output file for this plugin - plugin_dir = snap_dir / result.plugin - best_output = None - - # Check output_files first (new field) - if result.output_files: - first_file = next(iter(result.output_files.keys()), None) - if first_file and (plugin_dir / first_file).exists(): - best_output = f'{result.plugin}/{first_file}' - - # Fallback to output_str if it looks like a path - if not best_output and result.output_str and (snap_dir / result.output_str).exists(): - best_output = result.output_str - - if not best_output and plugin_dir.exists(): - # Intelligently find the best file in the plugin's directory - best_output = find_best_output_in_dir(plugin_dir, result.plugin) - - if best_output: - canonical[f'{result.plugin}_path'] = best_output - - # Also scan top-level for legacy outputs (backwards compatibility) - for file_path in snap_dir.glob('*'): - if file_path.is_dir() or file_path.name in ('index.html', 'index.json'): - continue - - ext = file_path.suffix.lstrip('.').lower() - if ext not in IFRAME_EMBEDDABLE_EXTENSIONS: - continue - - try: - size = file_path.stat().st_size - if size >= MIN_DISPLAY_SIZE: - # Add as generic output with stem as key - key = f'{file_path.stem}_path' - if key not in canonical: - canonical[key] = file_path.name - except OSError: - continue - - if self.is_static: - static_path = f'warc/{self.timestamp}' - canonical.update({ - 'title': self.basename, - 'wget_path': static_path, - }) - - return canonical - - def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]: - """Get the latest output that each plugin produced""" - from archivebox.hooks import get_plugins - from django.db.models import Q - - latest: Dict[str, Any] = {} - for plugin in get_plugins(): - results = self.archiveresult_set.filter(plugin=plugin) - if status is not None: - results = results.filter(status=status) - # Filter for results with output_files or output_str - results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts') - result = results.first() - # Return embed_path() 
for backwards compatibility - latest[plugin] = result.embed_path() if result else None - return latest - - # ========================================================================= - # Serialization Methods - # ========================================================================= - - def to_dict(self, extended: bool = False) -> Dict[str, Any]: - """Convert Snapshot to a dictionary (replacement for Link._asdict())""" - from archivebox.misc.util import ts_to_date_str - - result = { - 'TYPE': 'core.models.Snapshot', - 'id': str(self.id), - 'url': self.url, - 'timestamp': self.timestamp, - 'title': self.title, - 'tags': self.tags_str(), - 'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None, - 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, - 'created_at': self.created_at.isoformat() if self.created_at else None, - # Computed properties - 'domain': self.domain, - 'scheme': self.scheme, - 'base_url': self.base_url, - 'path': self.path, - 'basename': self.basename, - 'extension': self.extension, - 'is_static': self.is_static, - 'is_archived': self.is_archived, - 'archive_path': self.archive_path, - 'output_dir': self.output_dir, - 'link_dir': self.output_dir, # backwards compatibility alias - 'archive_size': self.archive_size, - 'bookmarked_date': self.bookmarked_date, - 'downloaded_datestr': self.downloaded_datestr, - 'num_outputs': self.num_outputs, - 'num_failures': self.num_failures, - } - if extended: - result['canonical'] = self.canonical_outputs() - return result - - def to_json(self, indent: int = 4) -> str: - """Convert to JSON string""" - return to_json(self.to_dict(extended=True), indent=indent) - - def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: - """Convert to CSV string""" - data = self.to_dict() - cols = cols or ['timestamp', 'is_archived', 'url'] - return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols) - - def write_json_details(self, out_dir: Optional[str] = None) -> None: - """Write JSON index file for this snapshot to its output directory""" - out_dir = out_dir or self.output_dir - path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME - atomic_write(str(path), self.to_dict(extended=True)) - - def write_html_details(self, out_dir: Optional[str] = None) -> None: - """Write HTML detail page for this snapshot to its output directory""" - from django.template.loader import render_to_string - from archivebox.config.common import SERVER_CONFIG - from archivebox.config.configset import get_config - from archivebox.misc.logging_util import printable_filesize - - out_dir = out_dir or self.output_dir - config = get_config() - SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True) - TITLE_LOADING_MSG = 'Not yet archived...' 
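        # (sketch) the context assembled below is rendered through the snapshot.html
        # template: canonical_outputs() entries are exposed as *_path keys, alongside
        # display fields ('title', 'size', 'status', 'tags') and the
        # SAVE_ARCHIVE_DOT_ORG / PREVIEW_ORIGINALS flags read above.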
- - canonical = self.canonical_outputs() - context = { - **self.to_dict(extended=True), - **{f'{k}_path': v for k, v in canonical.items()}, - 'canonical': {f'{k}_path': v for k, v in canonical.items()}, - 'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)), - 'url_str': htmlencode(urldecode(self.base_url)), - 'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank', - 'extension': self.extension or 'html', - 'tags': self.tags_str() or 'untagged', - 'size': printable_filesize(self.archive_size) if self.archive_size else 'pending', - 'status': 'archived' if self.is_archived else 'not yet archived', - 'status_color': 'success' if self.is_archived else 'danger', - 'oldest_archive_date': ts_to_date_str(self.oldest_archive_date), - 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, - 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, - } - rendered_html = render_to_string('snapshot.html', context) - atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) - - # ========================================================================= - # Helper Methods - # ========================================================================= - - @staticmethod - def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]: - return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None - - -# ============================================================================= -# Snapshot State Machine -# ============================================================================= - -class SnapshotMachine(BaseStateMachine, strict_states=True): - """ - State machine for managing Snapshot lifecycle. - - Hook Lifecycle: - ┌─────────────────────────────────────────────────────────────┐ - │ QUEUED State │ - │ • Waiting for snapshot to be ready │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() when can_start() - ┌─────────────────────────────────────────────────────────────┐ - │ STARTED State → enter_started() │ - │ 1. snapshot.run() │ - │ • discover_hooks('Snapshot') → finds all plugin hooks │ - │ • create_pending_archiveresults() → creates ONE │ - │ ArchiveResult per hook (NO execution yet) │ - │ 2. ArchiveResults process independently with their own │ - │ state machines (see ArchiveResultMachine) │ - │ 3. 
Advance through steps 0-9 as foreground hooks complete │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() when is_finished() - ┌─────────────────────────────────────────────────────────────┐ - │ SEALED State → enter_sealed() │ - │ • cleanup() → kills any background hooks still running │ - │ • Set retry_at=None (no more processing) │ - └─────────────────────────────────────────────────────────────┘ - - https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams - """ - - model_attr_name = 'snapshot' - - # States - queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True) - started = State(value=Snapshot.StatusChoices.STARTED) - sealed = State(value=Snapshot.StatusChoices.SEALED, final=True) - - # Tick Event - tick = ( - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished') | - started.to(sealed, cond='is_finished') - ) - - def can_start(self) -> bool: - can_start = bool(self.snapshot.url) - # Suppressed: queue waiting logs - return can_start - - def is_finished(self) -> bool: - """Check if snapshot processing is complete - delegates to model method.""" - return self.snapshot.is_finished_processing() - - @queued.enter - def enter_queued(self): - # Suppressed: state transition logs - self.snapshot.update_and_requeue( - retry_at=timezone.now(), - status=Snapshot.StatusChoices.QUEUED, - ) - - @started.enter - def enter_started(self): - # Suppressed: state transition logs - # lock the snapshot while we create the pending archiveresults - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying - ) - - # Run the snapshot - creates pending archiveresults for all enabled plugins - self.snapshot.run() - - # unlock the snapshot after we're done + set status = started - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s - status=Snapshot.StatusChoices.STARTED, - ) - - @sealed.enter - def enter_sealed(self): - # Clean up background hooks - self.snapshot.cleanup() - - # Suppressed: state transition logs - self.snapshot.update_and_requeue( - retry_at=None, - status=Snapshot.StatusChoices.SEALED, - ) - - -class ArchiveResultManager(models.Manager): - def indexable(self, sorted: bool = True): - INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE] - qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded') - if sorted: - precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE] - qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence') - return qs - - -class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): - class StatusChoices(models.TextChoices): - QUEUED = 'queued', 'Queued' - STARTED = 'started', 'Started' - BACKOFF = 'backoff', 'Waiting to retry' - SUCCEEDED = 'succeeded', 'Succeeded' - FAILED = 'failed', 'Failed' - SKIPPED = 'skipped', 'Skipped' - - @classmethod - def get_plugin_choices(cls): - """Get plugin choices from discovered hooks (for forms/admin).""" - plugins = [get_plugin_name(e) for e in get_plugins()] - return tuple((e, e) for e in plugins) - - # Keep AutoField for backward compatibility with 0.7.x databases - # UUID field is added separately by migration for new records - id = models.AutoField(primary_key=True, editable=False) - # 
Note: unique constraint is added by migration 0027 - don't set unique=True here - # or SQLite table recreation in earlier migrations will fail - uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True) - created_at = models.DateTimeField(default=timezone.now, db_index=True) - modified_at = models.DateTimeField(auto_now=True) - - snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore - # No choices= constraint - plugin names come from plugin system and can be any string - plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True) - hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)') - pwd = models.CharField(max_length=256, default=None, null=True, blank=True) - cmd = models.JSONField(default=None, null=True, blank=True) - cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) - - # New output fields (replacing old 'output' field) - output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary') - output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)') - output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}') - output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files') - output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size') - - # Binary FK (optional - set when hook reports cmd) - binary = models.ForeignKey( - 'machine.Binary', - on_delete=models.SET_NULL, - null=True, blank=True, - related_name='archiveresults', - help_text='Primary binary used by this hook' - ) - - start_ts = models.DateTimeField(default=None, null=True, blank=True) - end_ts = models.DateTimeField(default=None, null=True, blank=True) - - status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED) - retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) - notes = models.TextField(blank=True, null=False, default='') - output_dir = models.CharField(max_length=256, default=None, null=True, blank=True) - iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True) - - state_machine_name = 'core.models.ArchiveResultMachine' - retry_at_field_name = 'retry_at' - state_field_name = 'status' - active_state = StatusChoices.STARTED - - objects = ArchiveResultManager() - - class Meta(TypedModelMeta): - verbose_name = 'Archive Result' - verbose_name_plural = 'Archive Results Log' - - def __str__(self): - return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}' - - def save(self, *args, **kwargs): - is_new = self._state.adding - # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories - # Call the Django Model.save() directly instead - models.Model.save(self, *args, **kwargs) - - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created ArchiveResult', - indent_level=3, - plugin=self.plugin, - metadata={ - 'id': str(self.id), - 'snapshot_id': str(self.snapshot_id), - 'snapshot_url': 
str(self.snapshot.url)[:64], - 'status': self.status, - }, - ) - - @cached_property - def snapshot_dir(self): - return Path(self.snapshot.output_dir) - - @cached_property - def url(self): - return self.snapshot.url - - @property - def api_url(self) -> str: - return reverse_lazy('api-1:get_archiveresult', args=[self.id]) - - def get_absolute_url(self): - return f'/{self.snapshot.archive_path}/{self.plugin}' - - @property - def plugin_module(self) -> Any | None: - # Hook scripts are now used instead of Python plugin modules - # The plugin name maps to hooks in archivebox/plugins/{plugin}/ - return None - - def output_exists(self) -> bool: - return os.path.exists(Path(self.snapshot_dir) / self.plugin) - - def embed_path(self) -> Optional[str]: - """ - Get the relative path to the embeddable output file for this result. - - Returns the first file from output_files if set, otherwise tries to - find a reasonable default based on the plugin type. - """ - # Check output_files dict for primary output - if self.output_files: - # Return first file from output_files (dict preserves insertion order) - first_file = next(iter(self.output_files.keys()), None) - if first_file: - return f'{self.plugin}/{first_file}' - - # Fallback: check output_str if it looks like a file path - if self.output_str and ('/' in self.output_str or '.' in self.output_str): - return self.output_str - - # Try to find output file based on plugin's canonical output path - canonical = self.snapshot.canonical_outputs() - plugin_key = f'{self.plugin}_path' - if plugin_key in canonical: - return canonical[plugin_key] - - # Fallback to plugin directory - return f'{self.plugin}/' - - def create_output_dir(self): - output_dir = Path(self.snapshot_dir) / self.plugin - output_dir.mkdir(parents=True, exist_ok=True) - return output_dir - - @property - def output_dir_name(self) -> str: - return self.plugin - - @property - def output_dir_parent(self) -> str: - return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR)) - - def save_search_index(self): - pass - - def cascade_health_update(self, success: bool): - """Update health stats for self, parent Snapshot, and grandparent Crawl (if present).""" - self.increment_health_stats(success) - self.snapshot.increment_health_stats(success) - if self.snapshot.crawl_id: - self.snapshot.crawl.increment_health_stats(success) - - def run(self): - """ - Execute this ArchiveResult's hook and update status. - - If self.hook_name is set, runs only that specific hook. - If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat). - - Updates status/output fields, queues discovered URLs, and triggers indexing. 
- """ - from django.utils import timezone - from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook - from archivebox.config.configset import get_config - - # Get merged config with proper context - config = get_config( - crawl=self.snapshot.crawl if self.snapshot.crawl else None, - snapshot=self.snapshot, - ) - - # Determine which hook(s) to run - hooks = [] - - if self.hook_name: - # SPECIFIC HOOK MODE: Find the specific hook by name - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - hook_path = plugin_dir / self.hook_name - if hook_path.exists(): - hooks.append(hook_path) - break - else: - # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility) - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - matches = list(plugin_dir.glob('on_Snapshot__*.*')) - if matches: - hooks.extend(sorted(matches)) - - if not hooks: - self.status = self.StatusChoices.FAILED - if self.hook_name: - self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}' - else: - self.output_str = f'No hooks found for plugin: {self.plugin}' - self.retry_at = None - self.save() - return - - # Output directory is plugin_dir for the hook output - plugin_dir = Path(self.snapshot.output_dir) / self.plugin - - start_ts = timezone.now() - is_bg_hook = False - - for hook in hooks: - # Check if this is a background hook - is_bg_hook = is_background_hook(hook.name) - - result = run_hook( - hook, - output_dir=plugin_dir, - config=config, - url=self.snapshot.url, - snapshot_id=str(self.snapshot.id), - crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None, - depth=self.snapshot.depth, - ) - - # Background hooks return None - if result is None: - is_bg_hook = True - - # Update status based on hook execution - if is_bg_hook: - # BACKGROUND HOOK - still running, return immediately - # Status stays STARTED, will be finalized by Snapshot.cleanup() - self.status = self.StatusChoices.STARTED - self.start_ts = start_ts - self.pwd = str(plugin_dir) - self.save() - return - - # FOREGROUND HOOK - completed, update from filesystem - self.start_ts = start_ts - self.pwd = str(plugin_dir) - self.update_from_output() - - # Clean up empty output directory if no files were created - if plugin_dir.exists() and not self.output_files: - try: - if not any(plugin_dir.iterdir()): - plugin_dir.rmdir() - except (OSError, RuntimeError): - pass - - def update_from_output(self): - """ - Update this ArchiveResult from filesystem logs and output files. - - Used for: - - Foreground hooks that completed (called from ArchiveResult.run()) - - Background hooks that completed (called from Snapshot.cleanup()) - - Updates: - - status, output_str, output_json from ArchiveResult JSONL record - - output_files, output_size, output_mimetypes by walking filesystem - - end_ts, retry_at, cmd, cmd_version, binary FK - - Processes side-effect records (Snapshot, Tag, etc.) 
via process_hook_records() - """ - import json - import mimetypes - from collections import defaultdict - from pathlib import Path - from django.utils import timezone - from archivebox.hooks import process_hook_records - - plugin_dir = Path(self.pwd) if self.pwd else None - if not plugin_dir or not plugin_dir.exists(): - self.status = self.StatusChoices.FAILED - self.output_str = 'Output directory not found' - self.end_ts = timezone.now() - self.retry_at = None - self.save() - return - - # Read and parse JSONL output from stdout.log - stdout_file = plugin_dir / 'stdout.log' - stdout = stdout_file.read_text() if stdout_file.exists() else '' - - records = [] - for line in stdout.splitlines(): - if line.strip() and line.strip().startswith('{'): - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - continue - - # Find ArchiveResult record and update status/output from it - ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] - if ar_records: - hook_data = ar_records[0] - - # Update status - status_map = { - 'succeeded': self.StatusChoices.SUCCEEDED, - 'failed': self.StatusChoices.FAILED, - 'skipped': self.StatusChoices.SKIPPED, - } - self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED) - - # Update output fields - self.output_str = hook_data.get('output_str') or hook_data.get('output') or '' - self.output_json = hook_data.get('output_json') - - # Update cmd fields - if hook_data.get('cmd'): - self.cmd = hook_data['cmd'] - self._set_binary_from_cmd(hook_data['cmd']) - if hook_data.get('cmd_version'): - self.cmd_version = hook_data['cmd_version'][:128] - else: - # No ArchiveResult record = failed - self.status = self.StatusChoices.FAILED - self.output_str = 'Hook did not output ArchiveResult record' - - # Walk filesystem and populate output_files, output_size, output_mimetypes - exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} - mime_sizes = defaultdict(int) - total_size = 0 - output_files = {} - - for file_path in plugin_dir.rglob('*'): - if not file_path.is_file(): - continue - if file_path.name in exclude_names: - continue - - try: - stat = file_path.stat() - mime_type, _ = mimetypes.guess_type(str(file_path)) - mime_type = mime_type or 'application/octet-stream' - - relative_path = str(file_path.relative_to(plugin_dir)) - output_files[relative_path] = {} - mime_sizes[mime_type] += stat.st_size - total_size += stat.st_size - except (OSError, IOError): - continue - - self.output_files = output_files - self.output_size = total_size - sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) - self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes) - - # Update timestamps - self.end_ts = timezone.now() - self.retry_at = None - - self.save() - - # Process side-effect records (filter Snapshots for depth/URL) - filtered_records = [] - for record in records: - record_type = record.get('type') - - # Skip ArchiveResult records (already processed above) - if record_type == 'ArchiveResult': - continue - - # Filter Snapshot records for depth/URL constraints - if record_type == 'Snapshot': - if not self.snapshot.crawl: - continue - - url = record.get('url') - if not url: - continue - - depth = record.get('depth', self.snapshot.depth + 1) - if depth > self.snapshot.crawl.max_depth: - continue - - if not self._url_passes_filters(url): - continue - - filtered_records.append(record) - - # Process filtered records with unified dispatcher - overrides = { - 'snapshot': self.snapshot, - 
'crawl': self.snapshot.crawl, - 'created_by_id': self.snapshot.crawl.created_by_id, - } - process_hook_records(filtered_records, overrides=overrides) - - # Cleanup PID files and empty logs - pid_file = plugin_dir / 'hook.pid' - pid_file.unlink(missing_ok=True) - stderr_file = plugin_dir / 'stderr.log' - if stdout_file.exists() and stdout_file.stat().st_size == 0: - stdout_file.unlink() - if stderr_file.exists() and stderr_file.stat().st_size == 0: - stderr_file.unlink() - - def _set_binary_from_cmd(self, cmd: list) -> None: - """ - Find Binary for command and set binary FK. - - Tries matching by absolute path first, then by binary name. - Only matches binaries on the current machine. - """ - if not cmd: - return - - from archivebox.machine.models import Machine - - bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd - machine = Machine.current() - - # Try matching by absolute path first - binary = Binary.objects.filter( - abspath=bin_path_or_name, - machine=machine - ).first() - - if binary: - self.binary = binary - return - - # Fallback: match by binary name - bin_name = Path(bin_path_or_name).name - binary = Binary.objects.filter( - name=bin_name, - machine=machine - ).first() - - if binary: - self.binary = binary - - def _url_passes_filters(self, url: str) -> bool: - """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters. - - Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot - """ - import re - from archivebox.config.configset import get_config - - # Get merged config with proper hierarchy - config = get_config( - user=self.snapshot.crawl.created_by if self.snapshot else None, - crawl=self.snapshot.crawl if self.snapshot else None, - snapshot=self.snapshot, - ) - - # Get allowlist/denylist (can be string or list) - allowlist_raw = config.get('URL_ALLOWLIST', '') - denylist_raw = config.get('URL_DENYLIST', '') - - # Normalize to list of patterns - def to_pattern_list(value): - if isinstance(value, list): - return value - if isinstance(value, str): - return [p.strip() for p in value.split(',') if p.strip()] - return [] - - allowlist = to_pattern_list(allowlist_raw) - denylist = to_pattern_list(denylist_raw) - - # Denylist takes precedence - if denylist: - for pattern in denylist: - try: - if re.search(pattern, url): - return False - except re.error: - continue # Skip invalid regex patterns - - # If allowlist exists, URL must match at least one pattern - if allowlist: - for pattern in allowlist: - try: - if re.search(pattern, url): - return True - except re.error: - continue # Skip invalid regex patterns - return False # No allowlist patterns matched - - return True # No filters or passed filters - - @property - def output_dir(self) -> Path: - """Get the output directory for this plugin's results.""" - return Path(self.snapshot.output_dir) / self.plugin - - def is_background_hook(self) -> bool: - """Check if this ArchiveResult is for a background hook.""" - plugin_dir = Path(self.pwd) if self.pwd else None - if not plugin_dir: - return False - pid_file = plugin_dir / 'hook.pid' - return pid_file.exists() - - -# ============================================================================= -# ArchiveResult State Machine -# ============================================================================= - -class ArchiveResultMachine(BaseStateMachine, strict_states=True): - """ - State machine for managing ArchiveResult (single plugin execution) lifecycle. 
- - Hook Lifecycle: - ┌─────────────────────────────────────────────────────────────┐ - │ QUEUED State │ - │ • Waiting for its turn to run │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() when can_start() - ┌─────────────────────────────────────────────────────────────┐ - │ STARTED State → enter_started() │ - │ 1. archiveresult.run() │ - │ • Find specific hook by hook_name │ - │ • run_hook(script, output_dir, ...) → subprocess │ - │ │ - │ 2a. FOREGROUND hook (returns HookResult): │ - │ • update_from_output() immediately │ - │ - Read stdout.log │ - │ - Parse JSONL records │ - │ - Extract 'ArchiveResult' record → update status │ - │ - Walk output_dir → populate output_files │ - │ - Call process_hook_records() for side effects │ - │ │ - │ 2b. BACKGROUND hook (returns None): │ - │ • Status stays STARTED │ - │ • Continues running in background │ - │ • Killed by Snapshot.cleanup() when sealed │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() checks status - ┌─────────────────────────────────────────────────────────────┐ - │ SUCCEEDED / FAILED / SKIPPED / BACKOFF │ - │ • Set by hook's JSONL output during update_from_output() │ - │ • Health stats incremented (num_uses_succeeded/failed) │ - │ • Parent Snapshot health stats also updated │ - └─────────────────────────────────────────────────────────────┘ - - https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams - """ - - model_attr_name = 'archiveresult' - - # States - queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True) - started = State(value=ArchiveResult.StatusChoices.STARTED) - backoff = State(value=ArchiveResult.StatusChoices.BACKOFF) - succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True) - failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True) - skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True) - - # Tick Event - transitions based on conditions - tick = ( - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished') | - started.to(succeeded, cond='is_succeeded') | - started.to(failed, cond='is_failed') | - started.to(skipped, cond='is_skipped') | - started.to(backoff, cond='is_backoff') | - backoff.to.itself(unless='can_start') | - backoff.to(started, cond='can_start') | - backoff.to(succeeded, cond='is_succeeded') | - backoff.to(failed, cond='is_failed') | - backoff.to(skipped, cond='is_skipped') - ) - - def can_start(self) -> bool: - can_start = bool(self.archiveresult.snapshot.url) - # Suppressed: queue waiting logs - return can_start - - def is_succeeded(self) -> bool: - """Check if extractor plugin succeeded (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED - - def is_failed(self) -> bool: - """Check if extractor plugin failed (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED - - def is_skipped(self) -> bool: - """Check if extractor plugin was skipped (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED - - def is_backoff(self) -> bool: - """Check if we should backoff and retry later.""" - # Backoff if status is still started (plugin didn't complete) and output_str is empty - return ( - self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and - not self.archiveresult.output_str - ) - - def is_finished(self) -> bool: - """Check if extraction has 
completed (success, failure, or skipped).""" - return self.archiveresult.status in ( - ArchiveResult.StatusChoices.SUCCEEDED, - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ) - - @queued.enter - def enter_queued(self): - # Suppressed: state transition logs - self.archiveresult.update_and_requeue( - retry_at=timezone.now(), - status=ArchiveResult.StatusChoices.QUEUED, - start_ts=None, - ) # bump the snapshot's retry_at so they pickup any new changes - - @started.enter - def enter_started(self): - from archivebox.machine.models import NetworkInterface - - # Suppressed: state transition logs - # Lock the object and mark start time - self.archiveresult.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin - status=ArchiveResult.StatusChoices.STARTED, - start_ts=timezone.now(), - iface=NetworkInterface.current(), - ) - - # Run the plugin - this updates status, output, timestamps, etc. - self.archiveresult.run() - - # Save the updated result - self.archiveresult.save() - - # Suppressed: plugin result logs (already logged by worker) - - @backoff.enter - def enter_backoff(self): - # Suppressed: state transition logs - self.archiveresult.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=60), - status=ArchiveResult.StatusChoices.BACKOFF, - end_ts=None, - # retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1, - ) - - @succeeded.enter - def enter_succeeded(self): - # Suppressed: state transition logs - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.SUCCEEDED, - end_ts=timezone.now(), - # **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine} - ) - self.archiveresult.save() - - # Update health stats for ArchiveResult, Snapshot, and Crawl cascade - self.archiveresult.cascade_health_update(success=True) - - @failed.enter - def enter_failed(self): - # Suppressed: state transition logs - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.FAILED, - end_ts=timezone.now(), - ) - - # Update health stats for ArchiveResult, Snapshot, and Crawl cascade - self.archiveresult.cascade_health_update(success=False) - - @skipped.enter - def enter_skipped(self): - # Suppressed: state transition logs - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.SKIPPED, - end_ts=timezone.now(), - ) - - def after_transition(self, event: str, source: State, target: State): - # print(f"after '{event}' from '{source.id}' to '{target.id}'") - self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes - - -# ============================================================================= -# State Machine Registration -# ============================================================================= - -# Manually register state machines with python-statemachine registry -# (normally auto-discovered from statemachines.py, but we define them here for clarity) -registry.register(SnapshotMachine) -registry.register(ArchiveResultMachine) \ No newline at end of file diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index 685665a4..7e201f94 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str: 
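The tick-driven flow above can be reproduced in isolation with python-statemachine; the toy machine below mirrors the queued → started → sealed transitions and their can_start/is_finished guards. Class and attribute names here are illustrative stand-ins, not ArchiveBox's real models:

    from statemachine import StateMachine, State

    class ToySnapshotMachine(StateMachine):
        # Mirrors the queued -> started -> sealed flow shown above.
        queued = State(initial=True)
        started = State()
        sealed = State(final=True)

        tick = (
            queued.to.itself(unless="can_start")
            | queued.to(started, cond="can_start")
            | started.to.itself(unless="is_finished")
            | started.to(sealed, cond="is_finished")
        )

        def __init__(self, url=""):
            self.url = url         # stands in for snapshot.url
            self.finished = False  # stands in for snapshot.is_finished_processing()
            super().__init__()

        def can_start(self) -> bool:
            return bool(self.url)

        def is_finished(self) -> bool:
            return self.finished

    sm = ToySnapshotMachine(url="https://example.com")
    sm.tick()              # queued -> started (can_start is True)
    sm.finished = True
    sm.tick()              # started -> sealed (is_finished is True)
    assert sm.current_state.id == "sealed"

The real machines delegate their guards to model methods and bump retry_at on each transition, but the transition wiring is the same pattern.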
'output_path': output_path, 'plugin': plugin, }) - return mark_safe(tpl.render(ctx)) + rendered = tpl.render(ctx) + # Only return non-empty content (strip whitespace to check) + if rendered.strip(): + return mark_safe(rendered) + return '' except Exception: return '' @@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str: 'output_path': output_path, 'plugin': plugin, }) - return mark_safe(tpl.render(ctx)) + rendered = tpl.render(ctx) + # Only return non-empty content (strip whitespace to check) + if rendered.strip(): + return mark_safe(rendered) + return '' except Exception: return '' @@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str: 'output_path': output_path, 'plugin': plugin, }) - return mark_safe(tpl.render(ctx)) + rendered = tpl.render(ctx) + # Only return non-empty content (strip whitespace to check) + if rendered.strip(): + return mark_safe(rendered) + return '' except Exception: return '' diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 84a6bd2b..fd5dfbd8 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -539,7 +539,7 @@ from django.http import JsonResponse def live_progress_view(request): """Simple JSON endpoint for live progress status - used by admin progress monitor.""" try: - from workers.orchestrator import Orchestrator + from archivebox.workers.orchestrator import Orchestrator from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot, ArchiveResult from django.db.models import Case, When, Value, IntegerField diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py index f7819eda..1bb34b3a 100644 --- a/archivebox/crawls/apps.py +++ b/archivebox/crawls/apps.py @@ -4,3 +4,8 @@ from django.apps import AppConfig class CrawlsConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "archivebox.crawls" + label = "crawls" + + def ready(self): + """Import models to register state machines with the registry""" + from archivebox.crawls.models import CrawlMachine # noqa: F401 diff --git a/archivebox/crawls/migrations/0002_drop_seed_model.py b/archivebox/crawls/migrations/0002_drop_seed_model.py index c82dceb7..bf55c90a 100755 --- a/archivebox/crawls/migrations/0002_drop_seed_model.py +++ b/archivebox/crawls/migrations/0002_drop_seed_model.py @@ -17,39 +17,62 @@ class Migration(migrations.Migration): ] operations = [ - # Remove the seed foreign key from Crawl - migrations.RemoveField( - model_name='crawl', - name='seed', + # Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d) + migrations.RunPython( + code=lambda apps, schema_editor: None, + reverse_code=migrations.RunPython.noop, ), - # Delete the Seed model entirely - migrations.DeleteModel( - name='Seed', + # Delete the Seed model entirely (already done) + migrations.RunPython( + code=lambda apps, schema_editor: None, + reverse_code=migrations.RunPython.noop, ), - # Update fields to new schema - migrations.AlterField( - model_name='crawl', - name='created_by', - field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), - ), - migrations.AlterField( - model_name='crawl', - name='id', - field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), - ), - migrations.AlterField( - model_name='crawl', - name='urls', - field=models.TextField(help_text='Newline-separated list of URLs to crawl'), - ), - migrations.AlterField( - 
model_name='crawlschedule', - name='created_by', - field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), - ), - migrations.AlterField( - model_name='crawlschedule', - name='id', - field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + # Drop seed_id column if it exists, then update Django's migration state + migrations.SeparateDatabaseAndState( + state_operations=[ + # Update fields to new schema + migrations.AlterField( + model_name='crawl', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='crawl', + name='id', + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='crawl', + name='urls', + field=models.TextField(help_text='Newline-separated list of URLs to crawl'), + ), + migrations.AlterField( + model_name='crawlschedule', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='crawlschedule', + name='id', + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ], + database_operations=[ + # Drop seed table and NULL out seed_id FK values + migrations.RunSQL( + sql=""" + PRAGMA foreign_keys=OFF; + + -- NULL out seed_id values in crawls_crawl + UPDATE crawls_crawl SET seed_id = NULL; + + -- Drop seed table if it exists + DROP TABLE IF EXISTS crawls_seed; + + PRAGMA foreign_keys=ON; + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], ), ] diff --git a/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py index f4c26aa5..4d5b335d 100644 --- a/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py +++ b/archivebox/crawls/migrations/0003_alter_crawl_output_dir.py @@ -8,12 +8,21 @@ class Migration(migrations.Migration): dependencies = [ ('crawls', '0002_drop_seed_model'), + ('core', '0024_d_fix_crawls_config'), # Depends on config fix ] operations = [ - migrations.AlterField( - model_name='crawl', - name='output_dir', - field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')), + # Update Django's state only to avoid table rebuild that would re-apply old constraints + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name='crawl', + name='output_dir', + field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')), + ), + ], + database_operations=[ + # No database changes - output_dir type change is cosmetic for Django admin + ], ), ] diff --git a/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py b/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py index 809cf722..919bd021 100644 --- a/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py +++ b/archivebox/crawls/migrations/0004_alter_crawl_output_dir.py @@ -11,9 +11,17 @@ class Migration(migrations.Migration): ] operations = [ - migrations.AlterField( - 
model_name='crawl', - name='output_dir', - field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')), + # Update Django's state only to avoid table rebuild that would re-apply old constraints + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name='crawl', + name='output_dir', + field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')), + ), + ], + database_operations=[ + # No database changes - output_dir type change is cosmetic for Django admin + ], ), ] diff --git a/archivebox/crawls/migrations/0005_drop_seed_id_column.py b/archivebox/crawls/migrations/0005_drop_seed_id_column.py new file mode 100644 index 00000000..60bdecf1 --- /dev/null +++ b/archivebox/crawls/migrations/0005_drop_seed_id_column.py @@ -0,0 +1,28 @@ +# Drop seed_id column from Django's state (leave in database to avoid FK issues) + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawls', '0004_alter_crawl_output_dir'), + ] + + operations = [ + # Update Django's state only - leave seed_id column in database (unused but harmless) + # This avoids FK mismatch errors with crawls_crawlschedule + migrations.SeparateDatabaseAndState( + state_operations=[ + # Remove seed field from Django's migration state + migrations.RemoveField( + model_name='crawl', + name='seed', + ), + ], + database_operations=[ + # No database changes - seed_id column remains to avoid FK rebuild issues + # crawls_seed table can be manually dropped by DBA if needed + ], + ), + ] diff --git a/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py b/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py new file mode 100644 index 00000000..02805c72 --- /dev/null +++ b/archivebox/crawls/migrations/0006_alter_crawl_config_alter_crawl_output_dir_and_more.py @@ -0,0 +1,35 @@ +# Generated by Django 6.0 on 2025-12-29 06:45 + +import pathlib +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawls', '0005_drop_seed_id_column'), + ] + + operations = [ + # Update Django's state only - database already correct + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AlterField( + model_name='crawl', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AlterField( + model_name='crawl', + name='output_dir', + field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')), + ), + migrations.DeleteModel( + name='Seed', + ), + ], + database_operations=[ + # No database changes - Seed table already dropped in 0005 + ], + ), + ] diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 420db4a2..a0c9cdda 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith modified_at = models.DateTimeField(auto_now=True) urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl') - config = models.JSONField(default=dict) + config = models.JSONField(default=dict, null=True, blank=True) max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) tags_str = 
models.CharField(max_length=1024, blank=True, null=False, default='') persona_id = models.UUIDField(null=True, blank=True) @@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) - state_machine_name = 'crawls.models.CrawlMachine' + state_machine_name = 'archivebox.crawls.models.CrawlMachine' retry_at_field_name = 'retry_at' state_field_name = 'status' StatusChoices = ModelWithStateMachine.StatusChoices @@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith 'status': Snapshot.INITIAL_STATE, 'retry_at': timezone.now(), 'timestamp': str(timezone.now().timestamp()), - 'created_by_id': self.created_by_id, 'depth': 0, }, ) @@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith 'timestamp': timestamp or str(timezone.now().timestamp()), 'status': Snapshot.INITIAL_STATE, 'retry_at': timezone.now(), - 'created_by_id': self.created_by_id, + # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl } ) diff --git a/archivebox/machine/apps.py b/archivebox/machine/apps.py index f9b297a9..bbc02f78 100644 --- a/archivebox/machine/apps.py +++ b/archivebox/machine/apps.py @@ -7,8 +7,13 @@ class MachineConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' name = 'archivebox.machine' + label = 'machine' # Explicit label for migrations verbose_name = 'Machine Info' + def ready(self): + """Import models to register state machines with the registry""" + from archivebox.machine import models # noqa: F401 + def register_admin(admin_site): from archivebox.machine.admin import register_admin diff --git a/archivebox/machine/migrations/0001_squashed.py b/archivebox/machine/migrations/0001_squashed.py index cd2c7db9..3ef5b8be 100644 --- a/archivebox/machine/migrations/0001_squashed.py +++ b/archivebox/machine/migrations/0001_squashed.py @@ -85,6 +85,12 @@ class Migration(migrations.Migration): ('version', models.CharField(blank=True, default=None, max_length=32)), ('sha256', models.CharField(blank=True, default=None, max_length=64)), ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), + # Fields added in migration 0005 (included here for fresh installs) + ('binproviders', models.CharField(blank=True, default='env', max_length=127)), + ('output_dir', models.CharField(blank=True, default='', max_length=255)), + ('overrides', models.JSONField(blank=True, default=dict)), + ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)), + ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)), # dependency FK removed - Dependency model deleted ], options={ diff --git a/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py b/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py new file mode 100644 index 00000000..6d4b8ac7 --- /dev/null +++ b/archivebox/machine/migrations/0005_binary_binproviders_binary_output_dir_and_more.py @@ -0,0 +1,104 @@ +# Generated by Django 6.0 on 2025-12-29 06:45 + +import django.db.models.deletion +import django.utils.timezone 
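With Crawl.config (and the Machine stats/config fields further below) now declared null=True, downstream readers presumably have to treat None and {} the same way; a minimal sketch of that defensive merge, using illustrative names only:

    def merged_config(crawl_config, snapshot_config):
        # JSONField(null=True) means either value may be None instead of {}.
        merged = {}
        merged.update(crawl_config or {})
        merged.update(snapshot_config or {})
        return merged

    assert merged_config(None, {"TIMEOUT": 240}) == {"TIMEOUT": 240}
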
+from archivebox.uuid_compat import uuid7 +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0004_drop_dependency_table'), + ] + + operations = [ + # Update Django's state only - database already has correct schema + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddField( + model_name='binary', + name='binproviders', + field=models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127), + ), + migrations.AddField( + model_name='binary', + name='output_dir', + field=models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255), + ), + migrations.AddField( + model_name='binary', + name='overrides', + field=models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}"), + ), + migrations.AddField( + model_name='binary', + name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True), + ), + migrations.AddField( + model_name='binary', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16), + ), + migrations.AlterField( + model_name='binary', + name='abspath', + field=models.CharField(blank=True, default='', max_length=255), + ), + migrations.AlterField( + model_name='binary', + name='binprovider', + field=models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31), + ), + migrations.AlterField( + model_name='binary', + name='id', + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='binary', + name='machine', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine'), + ), + migrations.AlterField( + model_name='binary', + name='name', + field=models.CharField(blank=True, db_index=True, default='', max_length=63), + ), + migrations.AlterField( + model_name='binary', + name='sha256', + field=models.CharField(blank=True, default='', max_length=64), + ), + migrations.AlterField( + model_name='binary', + name='version', + field=models.CharField(blank=True, default='', max_length=32), + ), + migrations.AlterField( + model_name='machine', + name='config', + field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True), + ), + migrations.AlterField( + model_name='machine', + name='id', + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='machine', + name='stats', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AlterField( + model_name='networkinterface', + name='id', + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ], + database_operations=[ + # No database changes - schema already correct from previous migrations + ], + ), + ] diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index aeffd71c..cb4130f2 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -44,8 
+44,8 @@ class Machine(ModelWithHealthStats): os_platform = models.CharField(max_length=63, default=None, null=False) os_release = models.CharField(max_length=63, default=None, null=False) os_kernel = models.CharField(max_length=255, default=None, null=False) - stats = models.JSONField(default=dict, null=False) - config = models.JSONField(default=dict, null=False, blank=True, + stats = models.JSONField(default=dict, null=True, blank=True) + config = models.JSONField(default=dict, null=True, blank=True, help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)") num_uses_failed = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0) @@ -213,7 +213,7 @@ class Binary(ModelWithHealthStats): num_uses_failed = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0) - state_machine_name: str = 'machine.models.BinaryMachine' + state_machine_name: str = 'archivebox.machine.models.BinaryMachine' objects: BinaryManager = BinaryManager() diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py index d7440140..9a1cfb90 100644 --- a/archivebox/personas/apps.py +++ b/archivebox/personas/apps.py @@ -4,3 +4,4 @@ from django.apps import AppConfig class SessionsConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" name = "archivebox.personas" + label = "personas" diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 49b357d4..99f8ef87 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -21,7 +21,7 @@ # # COOKIES_TXT_FILE: '/path/to/cookies.txt', # # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir', # # CHECK_SSL_VALIDITY: False, -# # SAVE_ARCHIVE_DOT_ORG: True, +# # SAVE_ARCHIVEDOTORG: True, # # CHROME_BINARY: 'chromium' # # ... 
# # } diff --git a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py index 69f7c331..7b639efd 100644 --- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py +++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py @@ -63,7 +63,7 @@ def test_ripgrep_hook_detects_binary_from_path(): def test_ripgrep_hook_skips_when_backend_not_ripgrep(): """Test that ripgrep hook exits silently when search backend is not ripgrep.""" - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' env = os.environ.copy() env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend @@ -82,7 +82,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep(): def test_ripgrep_hook_handles_absolute_path(): """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path.""" - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' rg_path = shutil.which('rg') if not rg_path: @@ -222,7 +222,7 @@ def test_ripgrep_only_detected_when_backend_enabled(): if not shutil.which('rg'): pytest.skip("ripgrep not installed") - hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' + hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py' # Test 1: With ripgrep backend - should output Binary record env1 = os.environ.copy() diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index a08a87f9..1b6b2bbd 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -360,9 +360,11 @@
{% for result_info in archiveresults %} {% if result_info.result %} + {% plugin_thumbnail result_info.result as thumbnail_html %} + {% if thumbnail_html %}
- {% plugin_thumbnail result_info.result %} + {{ thumbnail_html }}
+ {% endif %} {% endif %} {% endfor %} @@ -395,7 +398,7 @@
- +