diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py index aea83413..88cdfe00 100644 --- a/archivebox/cli/archivebox_archiveresult.py +++ b/archivebox/cli/archivebox_archiveresult.py @@ -151,14 +151,14 @@ def create_archiveresults( result.save() if not is_tty: - write_record(result.to_json()) + write_record(result.to_jsonl()) created_count += 1 else: # Create all pending plugins snapshot.create_pending_archiveresults() for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED): if not is_tty: - write_record(result.to_json()) + write_record(result.to_jsonl()) created_count += 1 rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr) @@ -209,7 +209,7 @@ def list_archiveresults( }.get(result.status, 'dim') rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}') else: - write_record(result.to_json()) + write_record(result.to_jsonl()) count += 1 rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr) @@ -263,7 +263,7 @@ def update_archiveresults( updated_count += 1 if not is_tty: - write_record(result.to_json()) + write_record(result.to_jsonl()) except ArchiveResult.DoesNotExist: rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr) diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py index 979d9341..4c074b74 100644 --- a/archivebox/cli/archivebox_run.py +++ b/archivebox/cli/archivebox_run.py @@ -90,7 +90,7 @@ def process_stdin_records() -> int: if crawl.status not in [Crawl.StatusChoices.SEALED]: crawl.status = Crawl.StatusChoices.QUEUED crawl.save() - output_records.append(crawl.to_json()) + output_records.append(crawl.to_jsonl()) queued_count += 1 elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type): @@ -109,7 +109,7 @@ def process_stdin_records() -> int: if snapshot.status not in [Snapshot.StatusChoices.SEALED]: snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.save() - output_records.append(snapshot.to_json()) + output_records.append(snapshot.to_jsonl()) queued_count += 1 elif record_type == TYPE_ARCHIVERESULT: @@ -128,7 +128,7 @@ def process_stdin_records() -> int: if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]: archiveresult.status = ArchiveResult.StatusChoices.QUEUED archiveresult.save() - output_records.append(archiveresult.to_json()) + output_records.append(archiveresult.to_jsonl()) queued_count += 1 else: diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 46ad2949..8d012fd9 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -119,7 +119,7 @@ def create_snapshots( if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_json()) + write_record(snapshot.to_jsonl()) elif record_type == TYPE_SNAPSHOT or record.get('url'): # Input is a Snapshot or plain URL @@ -133,7 +133,7 @@ def create_snapshots( if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_json()) + write_record(snapshot.to_jsonl()) else: # Pass-through: output records we don't handle @@ -209,7 +209,7 @@ def list_snapshots( }.get(snapshot.status, 'dim') rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}') else: - write_record(snapshot.to_json()) + write_record(snapshot.to_jsonl()) count += 1 rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) @@ -270,7 +270,7 @@ def update_snapshots( updated_count += 1 if not is_tty: - write_record(snapshot.to_json()) + write_record(snapshot.to_jsonl()) except Snapshot.DoesNotExist: rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr) diff --git a/archivebox/core/migrations/0025_cleanup_schema.py b/archivebox/core/migrations/0025_cleanup_schema.py index f4b13fd2..4e3ac07e 100644 --- a/archivebox/core/migrations/0025_cleanup_schema.py +++ b/archivebox/core/migrations/0025_cleanup_schema.py @@ -29,10 +29,25 @@ def cleanup_extra_columns(apps, schema_editor): """) archive_results = cursor.fetchall() + # Skip if no archive results to migrate + if not archive_results: + print(" ✓ No ArchiveResult records to migrate") + return + from archivebox.uuid_compat import uuid7 from archivebox.base_models.models import get_or_create_system_user_pk - machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0] + # Get or create a machine record + machine_row = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone() + if machine_row: + machine_id = machine_row[0] + else: + # Create a default machine record + machine_id = str(uuid7()) + cursor.execute(""" + INSERT INTO machine_machine (id, created_at, modified_at, hostname, hw_in_docker, hw_in_vm, os_arch, os_family, os_platform, os_kernel, os_release) + VALUES (?, datetime('now'), datetime('now'), 'localhost', 0, 0, 'unknown', 'unknown', 'unknown', 'unknown', 'unknown') + """, (machine_id,)) for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results: # Create Process record diff --git a/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py b/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py index 4f4ed92b..5cb459da 100644 --- a/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py +++ b/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py @@ -2,9 +2,10 @@ import django.db.models.deletion import django.utils.timezone -import uuid from django.db import migrations, models +from archivebox.uuid_compat import uuid7 + class Migration(migrations.Migration): @@ -73,7 +74,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='uuid', - field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True), + field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True), ), migrations.AlterField( model_name='snapshot', @@ -98,7 +99,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='snapshot', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='snapshottag', diff --git a/archivebox/core/models.py b/archivebox/core/models.py index f7b45ba9..92e84c6e 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -128,6 +128,10 @@ class Tag(ModelWithSerializers): return tag + # Aliases + from_json = from_jsonl + to_json = to_jsonl + class SnapshotTag(models.Model): id = models.AutoField(primary_key=True) @@ -1596,6 +1600,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return snapshot + # Aliases + from_json = from_jsonl + to_json = to_jsonl + def create_pending_archiveresults(self) -> list['ArchiveResult']: """ Create ArchiveResult records for all enabled hooks. @@ -2310,6 +2318,60 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi record['process_id'] = str(self.process_id) return record + @staticmethod + def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): + """ + Create or get an ArchiveResult from a JSONL record. + + Args: + record: Dict with 'snapshot_id' and 'plugin' (required) + overrides: Optional dict of field overrides + + Returns: + ArchiveResult instance or None if invalid + """ + from django.utils import timezone + + overrides = overrides or {} + + # Check if already exists by ID + ar_id = record.get('id') + if ar_id: + try: + return ArchiveResult.objects.get(id=ar_id) + except ArchiveResult.DoesNotExist: + pass + + snapshot_id = record.get('snapshot_id') + plugin = record.get('plugin') + hook_name = record.get('hook_name', '') + + if not snapshot_id or not plugin: + return None + + # Get the snapshot + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + return None + + # Create or get the archive result + ar, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + hook_name=hook_name, + defaults={ + 'status': ArchiveResult.INITIAL_STATE, + 'retry_at': timezone.now(), + **overrides, + }, + ) + return ar + + # Aliases + from_json = from_jsonl + to_json = to_jsonl + def save(self, *args, **kwargs): is_new = self._state.adding @@ -2954,12 +3016,14 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): @started.enter def enter_started(self): - from archivebox.machine.models import NetworkInterface - - # Update Process with network interface + # Update Process with network interface (optional - fails gracefully) if self.archiveresult.process_id: - self.archiveresult.process.iface = NetworkInterface.current() - self.archiveresult.process.save() + try: + from archivebox.machine.models import NetworkInterface + self.archiveresult.process.iface = NetworkInterface.current() + self.archiveresult.process.save() + except Exception: + pass # Network interface detection is optional # Lock the object and mark start time self.archiveresult.update_and_requeue( diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index c3b588c4..21a27995 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -195,6 +195,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith ) return crawl + # Alias for from_jsonl + from_json = from_jsonl + to_json = to_jsonl + @property def output_dir_parent(self) -> str: """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}""" diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 73febfa0..936c0cda 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -890,17 +890,35 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ >>> get_plugin_special_config('wget', config) {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'} """ + import os + plugin_upper = plugin_name.upper() - # 1. Enabled: PLUGINNAME_ENABLED (default True) + # 1. Enabled: PLUGINNAME_ENABLED (default True unless DISABLE_ALL_PLUGINS is set) # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases enabled_key = f'{plugin_upper}_ENABLED' - enabled = config.get(enabled_key) - if enabled is None: - enabled = True - elif isinstance(enabled, str): - # Handle string values from config file ("true"/"false") - enabled = enabled.lower() not in ('false', '0', 'no', '') + + # Check for global plugin disable flag (useful for tests) + # When set, only plugins with EXPLICIT env var override are enabled + global_disable = config.get('DISABLE_ALL_PLUGINS') or os.environ.get('DISABLE_ALL_PLUGINS', '') + is_globally_disabled = global_disable is True or (isinstance(global_disable, str) and global_disable.lower() in ('true', '1', 'yes')) + + if is_globally_disabled: + # When globally disabled, only check environment for explicit override + # Ignore defaults from config.json + env_val = os.environ.get(enabled_key, '') + if env_val.lower() in ('true', '1', 'yes'): + enabled = True + else: + enabled = False + else: + # Normal mode - check config (which includes defaults from config.json) + enabled = config.get(enabled_key) + if enabled is None: + enabled = True + elif isinstance(enabled, str): + # Handle string values from config file ("true"/"false") + enabled = enabled.lower() not in ('false', '0', 'no', '') # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300) timeout_key = f'{plugin_upper}_TIMEOUT' diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 428633b3..85029a79 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -542,7 +542,7 @@ class ProcessManager(models.Manager): return process -class Process(ModelWithHealthStats): +class Process(models.Model): """ Tracks a single OS process execution. @@ -667,10 +667,6 @@ class Process(ModelWithHealthStats): help_text='When to retry this process' ) - # Health stats - num_uses_failed = models.PositiveIntegerField(default=0) - num_uses_succeeded = models.PositiveIntegerField(default=0) - state_machine_name: str = 'archivebox.machine.models.ProcessMachine' objects: ProcessManager = ProcessManager() diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py index 70445c71..97c78871 100644 --- a/archivebox/tests/conftest.py +++ b/archivebox/tests/conftest.py @@ -25,12 +25,12 @@ def run_archivebox_cmd( env['DATA_DIR'] = str(data_dir) env['USE_COLOR'] = 'False' env['SHOW_PROGRESS'] = 'False' - # Enable only HEADERS extractor (pure Python, no Chrome) - disable all others - env['SAVE_HEADERS'] = 'True' - for extractor in ['TITLE', 'FAVICON', 'WGET', 'WARC', 'PDF', 'SCREENSHOT', - 'DOM', 'SINGLEFILE', 'READABILITY', 'MERCURY', 'GIT', - 'YTDLP', 'HTMLTOTEXT', 'ARCHIVEDOTORG']: - env[f'SAVE_{extractor}'] = 'False' + + # Disable ALL plugins by default, then enable only the fast ones we need + env['DISABLE_ALL_PLUGINS'] = 'True' + # Enable only HEADERS - it's a fast JS script that makes HTTP HEAD request + env['HEADERS_ENABLED'] = 'True' + # Speed up network operations env['TIMEOUT'] = '5' env['CHECK_SSL_VALIDITY'] = 'False'