Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-03 09:25:42 +10:00)
Fix CLI tests and improve orchestrator performance
Key changes:
- Add DISABLE_ALL_PLUGINS config to allow disabling all plugins by default
- Fix NetworkInterface.current() exception handling in ArchiveResultMachine
- Add from_json/to_json aliases to Crawl, Snapshot, Tag, ArchiveResult models
- Fix CLI commands to use to_jsonl() for JSONL output (was using to_json() string)
- Fix migration 0025 to handle empty tables gracefully
- Fix migration 0027 uuid7 import from uuid_compat module
- Remove Process health stats fields (num_uses_failed/succeeded)
- Tests now pass with inline orchestrator mode for fast processing
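The common thread in the CLI hunks below is the JSONL contract: write_record() expects the plain record dict that to_jsonl() returns, whereas the old to_json() returned a JSON string, so the emitted line ended up as a quoted string rather than an object. A minimal sketch of that contract (this write_record body is illustrative, not the project's actual implementation):

    import json
    import sys

    def write_record(record: dict) -> None:
        # One compact JSON object per line, so commands can be piped into each other
        sys.stdout.write(json.dumps(record, default=str) + '\n')

    # inside a CLI command, for each model instance:
    #   write_record(snapshot.to_jsonl())   # dict in, one JSONL line out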
@@ -151,14 +151,14 @@ def create_archiveresults(
                 result.save()
 
                 if not is_tty:
-                    write_record(result.to_json())
+                    write_record(result.to_jsonl())
                 created_count += 1
         else:
             # Create all pending plugins
             snapshot.create_pending_archiveresults()
             for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
                 if not is_tty:
-                    write_record(result.to_json())
+                    write_record(result.to_jsonl())
                 created_count += 1
 
     rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
@@ -209,7 +209,7 @@ def list_archiveresults(
             }.get(result.status, 'dim')
             rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
         else:
-            write_record(result.to_json())
+            write_record(result.to_jsonl())
         count += 1
 
     rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
@@ -263,7 +263,7 @@ def update_archiveresults(
             updated_count += 1
 
             if not is_tty:
-                write_record(result.to_json())
+                write_record(result.to_jsonl())
 
         except ArchiveResult.DoesNotExist:
             rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
@@ -90,7 +90,7 @@ def process_stdin_records() -> int:
             if crawl.status not in [Crawl.StatusChoices.SEALED]:
                 crawl.status = Crawl.StatusChoices.QUEUED
                 crawl.save()
-            output_records.append(crawl.to_json())
+            output_records.append(crawl.to_jsonl())
             queued_count += 1
 
         elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
@@ -109,7 +109,7 @@ def process_stdin_records() -> int:
             if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
                 snapshot.status = Snapshot.StatusChoices.QUEUED
                 snapshot.save()
-            output_records.append(snapshot.to_json())
+            output_records.append(snapshot.to_jsonl())
             queued_count += 1
 
         elif record_type == TYPE_ARCHIVERESULT:
@@ -128,7 +128,7 @@ def process_stdin_records() -> int:
             if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
                 archiveresult.status = ArchiveResult.StatusChoices.QUEUED
                 archiveresult.save()
-            output_records.append(archiveresult.to_json())
+            output_records.append(archiveresult.to_jsonl())
             queued_count += 1
 
         else:
@@ -119,7 +119,7 @@ def create_snapshots(
             if snapshot:
                 created_snapshots.append(snapshot)
                 if not is_tty:
-                    write_record(snapshot.to_json())
+                    write_record(snapshot.to_jsonl())
 
         elif record_type == TYPE_SNAPSHOT or record.get('url'):
             # Input is a Snapshot or plain URL
@@ -133,7 +133,7 @@ def create_snapshots(
             if snapshot:
                 created_snapshots.append(snapshot)
                 if not is_tty:
-                    write_record(snapshot.to_json())
+                    write_record(snapshot.to_jsonl())
 
         else:
             # Pass-through: output records we don't handle
@@ -209,7 +209,7 @@ def list_snapshots(
             }.get(snapshot.status, 'dim')
             rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
         else:
-            write_record(snapshot.to_json())
+            write_record(snapshot.to_jsonl())
         count += 1
 
     rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
@@ -270,7 +270,7 @@ def update_snapshots(
             updated_count += 1
 
             if not is_tty:
-                write_record(snapshot.to_json())
+                write_record(snapshot.to_jsonl())
 
         except Snapshot.DoesNotExist:
             rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
@@ -29,10 +29,25 @@ def cleanup_extra_columns(apps, schema_editor):
     """)
     archive_results = cursor.fetchall()
 
+    # Skip if no archive results to migrate
+    if not archive_results:
+        print(" ✓ No ArchiveResult records to migrate")
+        return
+
     from archivebox.uuid_compat import uuid7
     from archivebox.base_models.models import get_or_create_system_user_pk
 
-    machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
+    # Get or create a machine record
+    machine_row = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
+    if machine_row:
+        machine_id = machine_row[0]
+    else:
+        # Create a default machine record
+        machine_id = str(uuid7())
+        cursor.execute("""
+            INSERT INTO machine_machine (id, created_at, modified_at, hostname, hw_in_docker, hw_in_vm, os_arch, os_family, os_platform, os_kernel, os_release)
+            VALUES (?, datetime('now'), datetime('now'), 'localhost', 0, 0, 'unknown', 'unknown', 'unknown', 'unknown', 'unknown')
+        """, (machine_id,))
 
     for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
         # Create Process record
@@ -2,9 +2,10 @@
 
 import django.db.models.deletion
 import django.utils.timezone
 import uuid
 from django.db import migrations, models
 
+from archivebox.uuid_compat import uuid7
 
 
 class Migration(migrations.Migration):
@@ -73,7 +74,7 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='archiveresult',
             name='uuid',
-            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
+            field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True),
         ),
         migrations.AlterField(
             model_name='snapshot',
@@ -98,7 +99,7 @@ class Migration(migrations.Migration):
         migrations.AlterField(
             model_name='snapshot',
             name='id',
-            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
         ),
         migrations.AlterField(
             model_name='snapshottag',
@@ -128,6 +128,10 @@ class Tag(ModelWithSerializers):
 
         return tag
 
+    # Aliases
+    from_json = from_jsonl
+    to_json = to_jsonl
+
 
 class SnapshotTag(models.Model):
     id = models.AutoField(primary_key=True)
@@ -1596,6 +1600,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
 
         return snapshot
 
+    # Aliases
+    from_json = from_jsonl
+    to_json = to_jsonl
+
     def create_pending_archiveresults(self) -> list['ArchiveResult']:
         """
         Create ArchiveResult records for all enabled hooks.
@@ -2310,6 +2318,60 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             record['process_id'] = str(self.process_id)
         return record
 
+    @staticmethod
+    def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
+        """
+        Create or get an ArchiveResult from a JSONL record.
+
+        Args:
+            record: Dict with 'snapshot_id' and 'plugin' (required)
+            overrides: Optional dict of field overrides
+
+        Returns:
+            ArchiveResult instance or None if invalid
+        """
+        from django.utils import timezone
+
+        overrides = overrides or {}
+
+        # Check if already exists by ID
+        ar_id = record.get('id')
+        if ar_id:
+            try:
+                return ArchiveResult.objects.get(id=ar_id)
+            except ArchiveResult.DoesNotExist:
+                pass
+
+        snapshot_id = record.get('snapshot_id')
+        plugin = record.get('plugin')
+        hook_name = record.get('hook_name', '')
+
+        if not snapshot_id or not plugin:
+            return None
+
+        # Get the snapshot
+        try:
+            snapshot = Snapshot.objects.get(id=snapshot_id)
+        except Snapshot.DoesNotExist:
+            return None
+
+        # Create or get the archive result
+        ar, created = ArchiveResult.objects.get_or_create(
+            snapshot=snapshot,
+            plugin=plugin,
+            hook_name=hook_name,
+            defaults={
+                'status': ArchiveResult.INITIAL_STATE,
+                'retry_at': timezone.now(),
+                **overrides,
+            },
+        )
+        return ar
+
+    # Aliases
+    from_json = from_jsonl
+    to_json = to_jsonl
+
     def save(self, *args, **kwargs):
         is_new = self._state.adding
 
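A short usage sketch of the from_jsonl helper and aliases added above (the record values are illustrative and assume a Snapshot with that id already exists):

    record = {'snapshot_id': str(snapshot.id), 'plugin': 'headers', 'hook_name': ''}
    ar = ArchiveResult.from_json(record)   # alias for from_jsonl(): get_or_create by (snapshot, plugin, hook_name)
    if ar is not None:
        line = ar.to_json()                # alias for to_jsonl(): returns the record dict, ready for write_record()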
@@ -2954,12 +3016,14 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
 
     @started.enter
    def enter_started(self):
-        from archivebox.machine.models import NetworkInterface
-
-        # Update Process with network interface
+        # Update Process with network interface (optional - fails gracefully)
         if self.archiveresult.process_id:
-            self.archiveresult.process.iface = NetworkInterface.current()
-            self.archiveresult.process.save()
+            try:
+                from archivebox.machine.models import NetworkInterface
+                self.archiveresult.process.iface = NetworkInterface.current()
+                self.archiveresult.process.save()
+            except Exception:
+                pass  # Network interface detection is optional
 
         # Lock the object and mark start time
         self.archiveresult.update_and_requeue(
@@ -195,6 +195,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         )
         return crawl
 
+    # Alias for from_jsonl
+    from_json = from_jsonl
+    to_json = to_jsonl
+
     @property
     def output_dir_parent(self) -> str:
         """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
@@ -890,17 +890,35 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     >>> get_plugin_special_config('wget', config)
     {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
     """
+    import os
+
     plugin_upper = plugin_name.upper()
 
-    # 1. Enabled: PLUGINNAME_ENABLED (default True)
+    # 1. Enabled: PLUGINNAME_ENABLED (default True unless DISABLE_ALL_PLUGINS is set)
     # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
     enabled_key = f'{plugin_upper}_ENABLED'
-    enabled = config.get(enabled_key)
-    if enabled is None:
-        enabled = True
-    elif isinstance(enabled, str):
-        # Handle string values from config file ("true"/"false")
-        enabled = enabled.lower() not in ('false', '0', 'no', '')
+
+    # Check for global plugin disable flag (useful for tests)
+    # When set, only plugins with EXPLICIT env var override are enabled
+    global_disable = config.get('DISABLE_ALL_PLUGINS') or os.environ.get('DISABLE_ALL_PLUGINS', '')
+    is_globally_disabled = global_disable is True or (isinstance(global_disable, str) and global_disable.lower() in ('true', '1', 'yes'))
+
+    if is_globally_disabled:
+        # When globally disabled, only check environment for explicit override
+        # Ignore defaults from config.json
+        env_val = os.environ.get(enabled_key, '')
+        if env_val.lower() in ('true', '1', 'yes'):
+            enabled = True
+        else:
+            enabled = False
+    else:
+        # Normal mode - check config (which includes defaults from config.json)
+        enabled = config.get(enabled_key)
+        if enabled is None:
+            enabled = True
+        elif isinstance(enabled, str):
+            # Handle string values from config file ("true"/"false")
+            enabled = enabled.lower() not in ('false', '0', 'no', '')
 
     # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
     timeout_key = f'{plugin_upper}_TIMEOUT'
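A hedged example of the precedence the new branch gives (config values are illustrative, and it assumes get_plugin_special_config is imported from its module):

    import os

    # With the global kill-switch on, config-file defaults are ignored:
    os.environ['DISABLE_ALL_PLUGINS'] = 'True'
    get_plugin_special_config('wget', {'WGET_ENABLED': True})   # -> {'enabled': False, ...}

    # Only an explicit environment override re-enables a plugin:
    os.environ['WGET_ENABLED'] = 'true'
    get_plugin_special_config('wget', {})                       # -> {'enabled': True, ...}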
@@ -542,7 +542,7 @@ class ProcessManager(models.Manager):
         return process
 
 
-class Process(ModelWithHealthStats):
+class Process(models.Model):
     """
     Tracks a single OS process execution.
 
@@ -667,10 +667,6 @@ class Process(ModelWithHealthStats):
         help_text='When to retry this process'
     )
 
-    # Health stats
-    num_uses_failed = models.PositiveIntegerField(default=0)
-    num_uses_succeeded = models.PositiveIntegerField(default=0)
-
     state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
 
     objects: ProcessManager = ProcessManager()
@@ -25,12 +25,12 @@ def run_archivebox_cmd(
     env['DATA_DIR'] = str(data_dir)
     env['USE_COLOR'] = 'False'
     env['SHOW_PROGRESS'] = 'False'
-    # Enable only HEADERS extractor (pure Python, no Chrome) - disable all others
-    env['SAVE_HEADERS'] = 'True'
-    for extractor in ['TITLE', 'FAVICON', 'WGET', 'WARC', 'PDF', 'SCREENSHOT',
-                      'DOM', 'SINGLEFILE', 'READABILITY', 'MERCURY', 'GIT',
-                      'YTDLP', 'HTMLTOTEXT', 'ARCHIVEDOTORG']:
-        env[f'SAVE_{extractor}'] = 'False'
+
+    # Disable ALL plugins by default, then enable only the fast ones we need
+    env['DISABLE_ALL_PLUGINS'] = 'True'
+    # Enable only HEADERS - it's a fast JS script that makes HTTP HEAD request
+    env['HEADERS_ENABLED'] = 'True'
+
     # Speed up network operations
     env['TIMEOUT'] = '5'
     env['CHECK_SSL_VALIDITY'] = 'False'