Fix CLI tests and improve orchestrator performance

Key changes:
- Add DISABLE_ALL_PLUGINS config to allow disabling all plugins by default
- Fix NetworkInterface.current() exception handling in ArchiveResultMachine
- Add from_json/to_json aliases to Crawl, Snapshot, Tag, ArchiveResult models
- Fix CLI commands to use to_jsonl() for JSONL output (was using to_json() string)
- Fix migration 0025 to handle empty tables gracefully
- Fix migration 0027 uuid7 import from uuid_compat module
- Remove Process health stats fields (num_uses_failed/succeeded)
- Tests now pass with inline orchestrator mode for fast processing
Claude
2025-12-31 12:40:18 +00:00
parent fd0589322b
commit 5c1e152a44
10 changed files with 136 additions and 38 deletions
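For context on the to_json()/to_jsonl() fixes in this commit: the CLI subcommands stream one JSON object per line between each other, and write_record() expects a plain dict (what to_jsonl() returns) rather than a pre-serialized JSON string. The helper below is only a minimal sketch of that plumbing under assumed names (write_record/read_records), not the actual ArchiveBox implementation.

```python
# Minimal sketch of the JSONL piping the CLI relies on (assumed, not verbatim
# from the ArchiveBox source): write_record() takes a dict and emits one
# compact JSON object per line; read_records() parses them back.
import json
import sys
from typing import Any, Dict, Iterator

def write_record(record: Dict[str, Any], file=sys.stdout) -> None:
    # One JSON object per line so the next command in the pipe can stream-parse it
    print(json.dumps(record, default=str), file=file)

def read_records(file=sys.stdin) -> Iterator[Dict[str, Any]]:
    # Yield one dict per non-empty input line
    for line in file:
        line = line.strip()
        if line:
            yield json.loads(line)
```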

View File

@@ -151,14 +151,14 @@ def create_archiveresults(
result.save()
if not is_tty:
-write_record(result.to_json())
+write_record(result.to_jsonl())
created_count += 1
else:
# Create all pending plugins
snapshot.create_pending_archiveresults()
for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
if not is_tty:
-write_record(result.to_json())
+write_record(result.to_jsonl())
created_count += 1
rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
@@ -209,7 +209,7 @@ def list_archiveresults(
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
else:
-write_record(result.to_json())
+write_record(result.to_jsonl())
count += 1
rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
@@ -263,7 +263,7 @@ def update_archiveresults(
updated_count += 1
if not is_tty:
-write_record(result.to_json())
+write_record(result.to_jsonl())
except ArchiveResult.DoesNotExist:
rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)

View File

@@ -90,7 +90,7 @@ def process_stdin_records() -> int:
if crawl.status not in [Crawl.StatusChoices.SEALED]:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save()
-output_records.append(crawl.to_json())
+output_records.append(crawl.to_jsonl())
queued_count += 1
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
@@ -109,7 +109,7 @@ def process_stdin_records() -> int:
if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save()
-output_records.append(snapshot.to_json())
+output_records.append(snapshot.to_jsonl())
queued_count += 1
elif record_type == TYPE_ARCHIVERESULT:
@@ -128,7 +128,7 @@ def process_stdin_records() -> int:
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
archiveresult.save()
-output_records.append(archiveresult.to_json())
+output_records.append(archiveresult.to_jsonl())
queued_count += 1
else:

View File

@@ -119,7 +119,7 @@ def create_snapshots(
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
-write_record(snapshot.to_json())
+write_record(snapshot.to_jsonl())
elif record_type == TYPE_SNAPSHOT or record.get('url'):
# Input is a Snapshot or plain URL
@@ -133,7 +133,7 @@ def create_snapshots(
if snapshot:
created_snapshots.append(snapshot)
if not is_tty:
-write_record(snapshot.to_json())
+write_record(snapshot.to_jsonl())
else:
# Pass-through: output records we don't handle
@@ -209,7 +209,7 @@ def list_snapshots(
}.get(snapshot.status, 'dim')
rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
else:
-write_record(snapshot.to_json())
+write_record(snapshot.to_jsonl())
count += 1
rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
@@ -270,7 +270,7 @@ def update_snapshots(
updated_count += 1
if not is_tty:
-write_record(snapshot.to_json())
+write_record(snapshot.to_jsonl())
except Snapshot.DoesNotExist:
rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)

View File

@@ -29,10 +29,25 @@ def cleanup_extra_columns(apps, schema_editor):
""")
archive_results = cursor.fetchall()
+# Skip if no archive results to migrate
+if not archive_results:
+    print(" ✓ No ArchiveResult records to migrate")
+    return
+from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
-machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
+# Get or create a machine record
+machine_row = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
+if machine_row:
+    machine_id = machine_row[0]
+else:
+    # Create a default machine record
+    machine_id = str(uuid7())
+    cursor.execute("""
+        INSERT INTO machine_machine (id, created_at, modified_at, hostname, hw_in_docker, hw_in_vm, os_arch, os_family, os_platform, os_kernel, os_release)
+        VALUES (?, datetime('now'), datetime('now'), 'localhost', 0, 0, 'unknown', 'unknown', 'unknown', 'unknown', 'unknown')
+    """, (machine_id,))
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
# Create Process record
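
The early return added above (skip when there are no ArchiveResult rows) is the usual "empty-table guard" for data migrations. A generic, hypothetical sketch of the same pattern written as a plain Django data migration (app label, model and dependency names are illustrative, not from this repo):

```python
# Illustrative data migration with an empty-table guard; not the actual 0025 migration.
from django.db import migrations

def migrate_archiveresults(apps, schema_editor):
    ArchiveResult = apps.get_model('core', 'ArchiveResult')
    if not ArchiveResult.objects.exists():
        print(" ✓ No ArchiveResult records to migrate")
        return
    # ... per-row migration work would go here ...

class Migration(migrations.Migration):
    dependencies = [('core', '0024_previous')]  # hypothetical dependency
    operations = [
        migrations.RunPython(migrate_archiveresults, migrations.RunPython.noop),
    ]
```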

View File

@@ -2,9 +2,10 @@
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
+from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
@@ -73,7 +74,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='uuid',
-field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
+field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
@@ -98,7 +99,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='snapshot',
name='id',
-field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshottag',
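
The fix above swaps uuid.uuid7 (only available in the stdlib on Python 3.14+) for a uuid7 imported from archivebox.uuid_compat. The real compat module is not shown in this diff; the snippet below is only a sketch of what such a shim typically looks like.

```python
# Hypothetical uuid_compat shim (illustrative, not the actual ArchiveBox module):
# use stdlib uuid.uuid7 when the interpreter provides it (Python 3.14+),
# otherwise fall back to something importable so migrations can still load.
import uuid

if hasattr(uuid, "uuid7"):
    uuid7 = uuid.uuid7
else:
    def uuid7() -> uuid.UUID:
        # Fallback for older interpreters; a real shim would emit a
        # time-ordered UUIDv7 instead of a random UUIDv4.
        return uuid.uuid4()
```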

View File

@@ -128,6 +128,10 @@ class Tag(ModelWithSerializers):
return tag
+# Aliases
+from_json = from_jsonl
+to_json = to_jsonl
class SnapshotTag(models.Model):
id = models.AutoField(primary_key=True)
@@ -1596,6 +1600,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return snapshot
+# Aliases
+from_json = from_jsonl
+to_json = to_jsonl
def create_pending_archiveresults(self) -> list['ArchiveResult']:
"""
Create ArchiveResult records for all enabled hooks.
@@ -2310,6 +2318,60 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
+@staticmethod
+def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
+    """
+    Create or get an ArchiveResult from a JSONL record.
+
+    Args:
+        record: Dict with 'snapshot_id' and 'plugin' (required)
+        overrides: Optional dict of field overrides
+
+    Returns:
+        ArchiveResult instance or None if invalid
+    """
+    from django.utils import timezone
+
+    overrides = overrides or {}
+
+    # Check if already exists by ID
+    ar_id = record.get('id')
+    if ar_id:
+        try:
+            return ArchiveResult.objects.get(id=ar_id)
+        except ArchiveResult.DoesNotExist:
+            pass
+
+    snapshot_id = record.get('snapshot_id')
+    plugin = record.get('plugin')
+    hook_name = record.get('hook_name', '')
+    if not snapshot_id or not plugin:
+        return None
+
+    # Get the snapshot
+    try:
+        snapshot = Snapshot.objects.get(id=snapshot_id)
+    except Snapshot.DoesNotExist:
+        return None
+
+    # Create or get the archive result
+    ar, created = ArchiveResult.objects.get_or_create(
+        snapshot=snapshot,
+        plugin=plugin,
+        hook_name=hook_name,
+        defaults={
+            'status': ArchiveResult.INITIAL_STATE,
+            'retry_at': timezone.now(),
+            **overrides,
+        },
+    )
+    return ar
+
+# Aliases
+from_json = from_jsonl
+to_json = to_jsonl
def save(self, *args, **kwargs):
is_new = self._state.adding
@@ -2954,12 +3016,14 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
-from archivebox.machine.models import NetworkInterface
-# Update Process with network interface
+# Update Process with network interface (optional - fails gracefully)
if self.archiveresult.process_id:
-self.archiveresult.process.iface = NetworkInterface.current()
-self.archiveresult.process.save()
+try:
+    from archivebox.machine.models import NetworkInterface
+    self.archiveresult.process.iface = NetworkInterface.current()
+    self.archiveresult.process.save()
+except Exception:
+    pass  # Network interface detection is optional
# Lock the object and mark start time
self.archiveresult.update_and_requeue(
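
As a usage illustration of the new from_jsonl()/from_json alias added above (the import path and UUID below are assumed, and this presumes a configured Django environment with an existing Snapshot):

```python
# Hypothetical round-trip through the new aliases; not taken from the test suite.
from core.models import ArchiveResult  # import path assumed

record = {
    'type': 'ArchiveResult',
    'snapshot_id': '018f3c0e-0000-7000-8000-000000000000',  # made-up example id
    'plugin': 'headers',
}
result = ArchiveResult.from_json(record)   # alias for from_jsonl(): get-or-create
if result is not None:
    line = result.to_json()                # alias for to_jsonl(): returns a dict record
```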

View File

@@ -195,6 +195,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
)
return crawl
+# Alias for from_jsonl
+from_json = from_jsonl
+to_json = to_jsonl
@property
def output_dir_parent(self) -> str:
"""Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""

View File

@@ -890,17 +890,35 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
>>> get_plugin_special_config('wget', config)
{'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
"""
+import os
plugin_upper = plugin_name.upper()
-# 1. Enabled: PLUGINNAME_ENABLED (default True)
+# 1. Enabled: PLUGINNAME_ENABLED (default True unless DISABLE_ALL_PLUGINS is set)
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
enabled_key = f'{plugin_upper}_ENABLED'
-enabled = config.get(enabled_key)
-if enabled is None:
-    enabled = True
-elif isinstance(enabled, str):
-    # Handle string values from config file ("true"/"false")
-    enabled = enabled.lower() not in ('false', '0', 'no', '')
+# Check for global plugin disable flag (useful for tests)
+# When set, only plugins with EXPLICIT env var override are enabled
+global_disable = config.get('DISABLE_ALL_PLUGINS') or os.environ.get('DISABLE_ALL_PLUGINS', '')
+is_globally_disabled = global_disable is True or (isinstance(global_disable, str) and global_disable.lower() in ('true', '1', 'yes'))
+
+if is_globally_disabled:
+    # When globally disabled, only check environment for explicit override
+    # Ignore defaults from config.json
+    env_val = os.environ.get(enabled_key, '')
+    if env_val.lower() in ('true', '1', 'yes'):
+        enabled = True
+    else:
+        enabled = False
+else:
+    # Normal mode - check config (which includes defaults from config.json)
+    enabled = config.get(enabled_key)
+    if enabled is None:
+        enabled = True
+    elif isinstance(enabled, str):
+        # Handle string values from config file ("true"/"false")
+        enabled = enabled.lower() not in ('false', '0', 'no', '')
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
timeout_key = f'{plugin_upper}_TIMEOUT'
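
Put together, the precedence introduced above is: DISABLE_ALL_PLUGINS turns every plugin off, and only an explicit PLUGINNAME_ENABLED environment override can turn one back on; otherwise the normal config/default lookup applies. A standalone sketch of that logic (illustrative, not the real get_plugin_special_config()):

```python
# Standalone sketch of the enabled/disabled precedence (key names mirror the diff,
# but this is illustrative, not the actual ArchiveBox config module).
import os
from typing import Any, Dict

def plugin_enabled(plugin_name: str, config: Dict[str, Any]) -> bool:
    enabled_key = f'{plugin_name.upper()}_ENABLED'
    global_disable = config.get('DISABLE_ALL_PLUGINS') or os.environ.get('DISABLE_ALL_PLUGINS', '')
    if str(global_disable).lower() in ('true', '1', 'yes'):
        # Globally disabled: only an explicit env var override re-enables a plugin
        return os.environ.get(enabled_key, '').lower() in ('true', '1', 'yes')
    enabled = config.get(enabled_key)
    if enabled is None:
        return True
    if isinstance(enabled, str):
        return enabled.lower() not in ('false', '0', 'no', '')
    return bool(enabled)

# e.g. with DISABLE_ALL_PLUGINS=True and HEADERS_ENABLED=True in the environment:
# plugin_enabled('headers', {}) -> True, plugin_enabled('wget', {}) -> False
```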

View File

@@ -542,7 +542,7 @@ class ProcessManager(models.Manager):
return process
-class Process(ModelWithHealthStats):
+class Process(models.Model):
"""
Tracks a single OS process execution.
@@ -667,10 +667,6 @@ class Process(ModelWithHealthStats):
help_text='When to retry this process'
)
-# Health stats
-num_uses_failed = models.PositiveIntegerField(default=0)
-num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
objects: ProcessManager = ProcessManager()

View File

@@ -25,12 +25,12 @@ def run_archivebox_cmd(
env['DATA_DIR'] = str(data_dir)
env['USE_COLOR'] = 'False'
env['SHOW_PROGRESS'] = 'False'
-# Enable only HEADERS extractor (pure Python, no Chrome) - disable all others
-env['SAVE_HEADERS'] = 'True'
-for extractor in ['TITLE', 'FAVICON', 'WGET', 'WARC', 'PDF', 'SCREENSHOT',
-                  'DOM', 'SINGLEFILE', 'READABILITY', 'MERCURY', 'GIT',
-                  'YTDLP', 'HTMLTOTEXT', 'ARCHIVEDOTORG']:
-    env[f'SAVE_{extractor}'] = 'False'
+# Disable ALL plugins by default, then enable only the fast ones we need
+env['DISABLE_ALL_PLUGINS'] = 'True'
+# Enable only HEADERS - it's a fast JS script that makes HTTP HEAD request
+env['HEADERS_ENABLED'] = 'True'
# Speed up network operations
env['TIMEOUT'] = '5'
env['CHECK_SSL_VALIDITY'] = 'False'
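
For completeness, a hedged sketch of how a test helper like run_archivebox_cmd() might hand this environment to the CLI (the real helper's signature and remaining arguments are not shown in this hunk):

```python
# Illustrative only: build the fast-test environment and invoke the archivebox
# CLI in a subprocess with it.
import os
import subprocess

def run_archivebox_cmd(args, data_dir, timeout=60):
    env = os.environ.copy()
    env['DATA_DIR'] = str(data_dir)
    env['USE_COLOR'] = 'False'
    env['SHOW_PROGRESS'] = 'False'
    env['DISABLE_ALL_PLUGINS'] = 'True'   # turn every plugin off by default
    env['HEADERS_ENABLED'] = 'True'       # re-enable only the fast HEADERS plugin
    env['TIMEOUT'] = '5'
    env['CHECK_SSL_VALIDITY'] = 'False'
    return subprocess.run(
        ['archivebox', *args],
        env=env, capture_output=True, text=True, timeout=timeout,
    )
```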