This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -23,7 +23,9 @@
"Bash(source .venv/bin/activate)",
"Bash(mv:*)",
"Bash(echo:*)",
"Bash(grep:*)"
"Bash(grep:*)",
"WebFetch(domain:python-statemachine.readthedocs.io)",
"Bash(./bin/run_plugin_tests.sh:*)"
]
}
}

View File

@@ -24,12 +24,14 @@ ASCII_LOGO = """
╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝
"""
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them thorugh INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
# Migrations reference models like 'machine.Binary' which need to be importable
if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
os.environ['TZ'] = 'UTC'
# detect ArchiveBox user's UID/GID based on data dir ownership

View File

@@ -5,7 +5,7 @@ from signal_webhooks.utils import get_webhook_model
from archivebox.base_models.admin import BaseModelAdmin
from api.models import APIToken
from archivebox.api.models import APIToken
class APITokenAdmin(BaseModelAdmin):

View File

@@ -4,9 +4,9 @@ from django.apps import AppConfig
class APIConfig(AppConfig):
name = 'api'
name = 'archivebox.api'
def register_admin(admin_site):
from api.admin import register_admin
from archivebox.api.admin import register_admin
register_admin(admin_site)

View File

@@ -7,7 +7,7 @@ from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import api.models
import archivebox.api.models
class Migration(migrations.Migration):
@@ -38,7 +38,7 @@ class Migration(migrations.Migration):
('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
('modified_at', models.DateTimeField(auto_now=True)),
('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
('expires', models.DateTimeField(blank=True, null=True)),
],
options={

View File

@@ -1,6 +1,6 @@
# Generated by Django 6.0 on 2025-12-27 01:40
import base_models.models
import archivebox.core.models
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
@@ -17,11 +17,11 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='apitoken',
name='created_by',
field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='outboundwebhook',
name='created_by',
field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
]

View File

@@ -10,7 +10,7 @@ from django.utils import timezone
from django_stubs_ext.db.models import TypedModelMeta
from signal_webhooks.models import WebhookBase
from base_models.models import get_or_create_system_user_pk
from archivebox.base_models.models import get_or_create_system_user_pk
def generate_secret_token() -> str:
@@ -26,6 +26,7 @@ class APIToken(models.Model):
expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta):
app_label = 'api'
verbose_name = "API Key"
verbose_name_plural = "API Keys"
@@ -47,6 +48,7 @@ class OutboundWebhook(WebhookBase):
modified_at = models.DateTimeField(auto_now=True)
class Meta(WebhookBase.Meta):
app_label = 'api'
verbose_name = 'API Outbound Webhook'
def __str__(self) -> str:

View File

@@ -15,7 +15,7 @@ from ninja import NinjaAPI, Swagger
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from api.auth import API_AUTH_METHODS
from archivebox.api.auth import API_AUTH_METHODS
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'

View File

@@ -6,8 +6,8 @@ from ninja import Router, Schema
from django.utils import timezone
from datetime import timedelta
from api.models import APIToken
from api.auth import auth_using_token, auth_using_password, get_or_create_api_token
from archivebox.api.models import APIToken
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
router = Router(tags=['Authentication'], auth=None)

View File

@@ -118,6 +118,7 @@ def cli_add(request, args: AddCommandSchema):
plugins=args.plugins,
parser=args.parser,
bg=True, # Always run in background for API calls
created_by_id=request.user.pk,
)
return {

View File

@@ -14,8 +14,8 @@ from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag
from api.v1_crawls import CrawlSchema
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.api.v1_crawls import CrawlSchema
router = Router(tags=['Core Models'])
@@ -80,12 +80,11 @@ class MinimalArchiveResultSchema(Schema):
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
return str(obj.created_by.pk)
@staticmethod
def resolve_created_by_username(obj) -> str:
User = get_user_model()
return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
return obj.created_by.username
class ArchiveResultSchema(MinimalArchiveResultSchema):
@@ -166,12 +165,11 @@ class SnapshotSchema(Schema):
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
return str(obj.created_by.pk)
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
return obj.created_by.username
@staticmethod
def resolve_tags(obj):
@@ -190,8 +188,8 @@ class SnapshotSchema(Schema):
class SnapshotFilterSchema(FilterSchema):
id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
created_by_id: str = Field(None, q='created_by_id')
created_by_username: str = Field(None, q='created_by__username__icontains')
created_by_id: str = Field(None, q='crawl__created_by_id')
created_by_username: str = Field(None, q='crawl__created_by__username__icontains')
created_at__gte: datetime = Field(None, q='created_at__gte')
created_at__lt: datetime = Field(None, q='created_at__lt')
created_at: datetime = Field(None, q='created_at')

View File

@@ -9,8 +9,8 @@ from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from .auth import API_AUTH_METHODS

View File

@@ -7,7 +7,7 @@ from datetime import datetime
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate
from api.v1_core import CustomPagination
from archivebox.api.v1_core import CustomPagination
router = Router(tags=['Machine and Dependencies'])
@@ -102,14 +102,14 @@ class BinaryFilterSchema(FilterSchema):
@paginate(CustomPagination)
def get_machines(request, filters: MachineFilterSchema = Query(...)):
"""List all machines."""
from machine.models import Machine
from archivebox.machine.models import Machine
return filters.filter(Machine.objects.all()).distinct()
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
def get_machine(request, machine_id: str):
"""Get a specific machine by ID."""
from machine.models import Machine
from archivebox.machine.models import Machine
from django.db.models import Q
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
@@ -117,7 +117,7 @@ def get_machine(request, machine_id: str):
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
def get_current_machine(request):
"""Get the current machine."""
from machine.models import Machine
from archivebox.machine.models import Machine
return Machine.current()
@@ -132,19 +132,19 @@ def get_current_machine(request):
@paginate(CustomPagination)
def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
"""List all binaries."""
from machine.models import Binary
from archivebox.machine.models import Binary
return filters.filter(Binary.objects.all().select_related('machine', 'dependency')).distinct()
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
def get_binary(request, binary_id: str):
"""Get a specific binary by ID."""
from machine.models import Binary
from archivebox.machine.models import Binary
return Binary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request, name: str):
"""Get all binaries with the given name."""
from machine.models import Binary
from archivebox.machine.models import Binary
return list(Binary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))

View File

@@ -12,6 +12,7 @@ from pathlib import Path
from django.contrib import admin
from django.db import models
from django.db.models import F
from django.utils import timezone
from django.contrib.auth import get_user_model
from django.urls import reverse_lazy
@@ -110,6 +111,11 @@ class ModelWithHealthStats(models.Model):
total = max(self.num_uses_failed + self.num_uses_succeeded, 1)
return round((self.num_uses_succeeded / total) * 100)
def increment_health_stats(self, success: bool):
"""Atomically increment success or failure counter using F() expression."""
field = 'num_uses_succeeded' if success else 'num_uses_failed'
type(self).objects.filter(pk=self.pk).update(**{field: F(field) + 1})
class ModelWithConfig(models.Model):
"""Mixin for models with a JSON config field."""

View File

@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
if TYPE_CHECKING:
from core.models import Snapshot
from archivebox.core.models import Snapshot
@enforce_types
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
# import models once django is set up
from core.models import Snapshot
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator

View File

@@ -66,18 +66,38 @@ def config(*keys,
raise SystemExit(1)
else:
matching_config = FLAT_CONFIG
# Display core config sections
for config_section in CONFIGS.values():
if hasattr(config_section, 'toml_section_header'):
print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
else:
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
# Display plugin config section
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
plugin_keys = {}
# Collect all plugin config keys
for plugin_name, schema in plugin_configs.items():
if 'properties' not in schema:
continue
for key in schema['properties'].keys():
if key in matching_config:
plugin_keys[key] = matching_config[key]
# Display all plugin config in single [PLUGINS] section
if plugin_keys:
print(f'[grey53]\\[PLUGINS][/grey53]')
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
print('[grey53]################################################################[/grey53]')
raise SystemExit(not matching_config)
elif set:

View File

@@ -72,11 +72,11 @@ def discover_outlinks(
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record,
TYPE_SNAPSHOT, get_or_create_snapshot
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot, ArchiveResult
from crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator
@@ -130,8 +130,10 @@ def discover_outlinks(
record['crawl_id'] = str(crawl.id)
record['depth'] = record.get('depth', 0)
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
snapshot_ids.append(str(snapshot.id))
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
if snapshot:
snapshot_ids.append(str(snapshot.id))
except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
@@ -162,7 +164,6 @@ def discover_outlinks(
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
'created_by_id': snapshot.created_by_id,
}
)
else:
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
- Transition from started -> sealed (when all snapshots done)
"""
from rich import print as rprint
from crawls.models import Crawl
from archivebox.crawls.models import Crawl
try:
crawl = Crawl.objects.get(id=crawl_id)
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
if not uuid_pattern.match(value):
return False
# Verify it's actually a Crawl (not a Snapshot or other object)
from crawls.models import Crawl
from archivebox.crawls.models import Crawl
return Crawl.objects.filter(id=value).exists()

View File

@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
"""
from rich import print as rprint
from core.models import ArchiveResult
from archivebox.core.models import ArchiveResult
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -95,7 +95,7 @@ def run_plugins(
read_args_or_stdin, write_record, archiveresult_to_jsonl,
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
)
from core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot, ArchiveResult
from workers.orchestrator import Orchestrator
is_tty = sys.stdout.isatty()
@@ -155,7 +155,6 @@ def run_plugins(
defaults={
'status': ArchiveResult.StatusChoices.QUEUED,
'retry_at': timezone.now(),
'created_by_id': snapshot.created_by_id,
}
)
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
if not uuid_pattern.match(value):
return False
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
from core.models import ArchiveResult
from archivebox.core.models import ArchiveResult
return ArchiveResult.objects.filter(id=value).exists()

View File

@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print()
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
from core.models import Snapshot
from archivebox.core.models import Snapshot
all_links = Snapshot.objects.none()
pending_links: dict[str, SnapshotDict] = {}

View File

@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
setup_django()
from django.utils import timezone
from crawls.models import Crawl
from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
# Create a crawl for dependency detection
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
from archivebox.crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(

View File

@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
to_remove = snapshots.count()
from archivebox.search import flush_search_index
from core.models import Snapshot
from archivebox.core.models import Snapshot
flush_search_index(snapshots=snapshots)
snapshots.delete()

View File

@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> QuerySet:
"""Filter and return Snapshots matching the given criteria."""
from core.models import Snapshot
from archivebox.core.models import Snapshot
if snapshots:
result = snapshots
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
csv: str | None=None,
with_headers: bool=False):
"""List, filter, and export information about archive entries"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
if with_headers and not (json or html or csv):
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')

View File

@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
- Transition from started -> sealed (when all ArchiveResults done)
"""
from rich import print as rprint
from core.models import Snapshot
from archivebox.core.models import Snapshot
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
@@ -88,11 +88,11 @@ def create_snapshots(
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record, snapshot_to_jsonl,
TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
TYPE_SNAPSHOT, TYPE_TAG
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -137,8 +137,10 @@ def create_snapshots(
record['tags'] = tag
# Get or create the snapshot
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
created_snapshots.append(snapshot)
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
if snapshot:
created_snapshots.append(snapshot)
# Output JSONL record (only when piped)
if not is_tty:

View File

@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
from django.contrib.auth import get_user_model
from archivebox.misc.db import get_admins
from core.models import Snapshot
from archivebox.core.models import Snapshot
User = get_user_model()
print('[green]\\[*] Scanning archive main index...[/green]')

View File

@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
from archivebox.config.django import setup_django
setup_django()
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.utils import timezone
while True:
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
Skip symlinks (already migrated).
Create DB records and trigger migration on save().
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.config import CONSTANTS
from django.db import transaction
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
Process all snapshots in DB.
Reconcile index.json and queue for archiving.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
batch_size: int
) -> dict:
"""Process snapshots matching filters (DB query only)."""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from django.db import transaction
from django.utils import timezone
from datetime import datetime

View File

@@ -107,7 +107,7 @@ def version(quiet: bool=False,
from archivebox.config.django import setup_django
setup_django()
from machine.models import Machine, Binary
from archivebox.machine.models import Machine, Binary
machine = Machine.current()

View File

@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox snapshot URL
Should create a Snapshot and output JSONL when piped.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record, snapshot_to_jsonl,
TYPE_SNAPSHOT, get_or_create_snapshot
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
self.assertEqual(records[0]['url'], url)
# Create snapshot
snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.misc.jsonl import (
snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
snapshot_to_jsonl, read_args_or_stdin,
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
snapshot_output = snapshot_to_jsonl(snapshot)
# Step 2: Parse snapshot output as extract input
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
This is equivalent to: archivebox add URL
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
TYPE_SNAPSHOT
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
This is equivalent to: archivebox add --depth=1 URL
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import (
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
TYPE_SNAPSHOT
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
Depth 0: Only archive the specified URL, no crawling.
"""
from core.models import Snapshot
from archivebox.core.models import Snapshot
from archivebox.misc.jsonl import get_or_create_snapshot
from archivebox.base_models.models import get_or_create_system_user_pk

View File

@@ -35,177 +35,41 @@ def _get_config():
# These are recalculated each time the module attribute is accessed
def __getattr__(name: str):
"""Module-level __getattr__ for lazy config loading."""
# Timeout settings
"""
Module-level __getattr__ for lazy config loading.
Only provides backwards compatibility for GENERIC/SHARED config.
Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
"""
# Generic timeout settings (used by multiple plugins)
if name == 'TIMEOUT':
cfg, _ = _get_config()
return cfg.TIMEOUT
if name == 'MEDIA_TIMEOUT':
cfg, _ = _get_config()
return cfg.MEDIA_TIMEOUT
# SSL/Security settings
# Generic SSL/Security settings (used by multiple plugins)
if name == 'CHECK_SSL_VALIDITY':
cfg, _ = _get_config()
return cfg.CHECK_SSL_VALIDITY
# Storage settings
# Generic storage settings (used by multiple plugins)
if name == 'RESTRICT_FILE_NAMES':
_, storage = _get_config()
return storage.RESTRICT_FILE_NAMES
# User agent / cookies
# Generic user agent / cookies (used by multiple plugins)
if name == 'COOKIES_FILE':
cfg, _ = _get_config()
return cfg.COOKIES_FILE
if name == 'USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'CURL_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'WGET_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'CHROME_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
# Archive method toggles (SAVE_*)
if name == 'SAVE_TITLE':
return True
if name == 'SAVE_FAVICON':
return True
if name == 'SAVE_WGET':
return True
if name == 'SAVE_WARC':
return True
if name == 'SAVE_WGET_REQUISITES':
return True
if name == 'SAVE_SINGLEFILE':
return True
if name == 'SAVE_READABILITY':
return True
if name == 'SAVE_MERCURY':
return True
if name == 'SAVE_HTMLTOTEXT':
return True
if name == 'SAVE_PDF':
return True
if name == 'SAVE_SCREENSHOT':
return True
if name == 'SAVE_DOM':
return True
if name == 'SAVE_HEADERS':
return True
if name == 'SAVE_GIT':
return True
if name == 'SAVE_MEDIA':
return True
if name == 'SAVE_ARCHIVE_DOT_ORG':
return True
# Extractor-specific settings
# Generic resolution settings (used by multiple plugins)
if name == 'RESOLUTION':
cfg, _ = _get_config()
return cfg.RESOLUTION
if name == 'GIT_DOMAINS':
return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
if name == 'MEDIA_MAX_SIZE':
cfg, _ = _get_config()
return cfg.MEDIA_MAX_SIZE
if name == 'FAVICON_PROVIDER':
return 'https://www.google.com/s2/favicons?domain={}'
# Binary paths (use shutil.which for detection)
if name == 'CURL_BINARY':
return shutil.which('curl') or 'curl'
if name == 'WGET_BINARY':
return shutil.which('wget') or 'wget'
if name == 'GIT_BINARY':
return shutil.which('git') or 'git'
if name == 'YOUTUBEDL_BINARY':
return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
if name == 'CHROME_BINARY':
for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
path = shutil.which(chrome)
if path:
return path
return 'chromium'
if name == 'NODE_BINARY':
return shutil.which('node') or 'node'
if name == 'SINGLEFILE_BINARY':
return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
if name == 'READABILITY_BINARY':
return shutil.which('readability-extractor') or 'readability-extractor'
if name == 'MERCURY_BINARY':
return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
# Binary versions (return placeholder, actual version detection happens elsewhere)
if name == 'CURL_VERSION':
return 'curl'
if name == 'WGET_VERSION':
return 'wget'
if name == 'GIT_VERSION':
return 'git'
if name == 'YOUTUBEDL_VERSION':
return 'yt-dlp'
if name == 'CHROME_VERSION':
return 'chromium'
if name == 'SINGLEFILE_VERSION':
return 'singlefile'
if name == 'READABILITY_VERSION':
return 'readability'
if name == 'MERCURY_VERSION':
return 'mercury'
# Binary arguments
if name == 'CURL_ARGS':
return ['--silent', '--location', '--compressed']
if name == 'WGET_ARGS':
return [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
if name == 'GIT_ARGS':
return ['--recursive']
if name == 'YOUTUBEDL_ARGS':
cfg, _ = _get_config()
return [
'--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
'--no-abort-on-error',
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
]
if name == 'SINGLEFILE_ARGS':
return None # Uses defaults
if name == 'CHROME_ARGS':
return []
# Other settings
if name == 'WGET_AUTO_COMPRESSION':
return True
if name == 'DEPENDENCIES':
return {} # Legacy, not used anymore
# Allowlist/Denylist patterns (compiled regexes)
if name == 'SAVE_ALLOWLIST_PTN':
cfg, _ = _get_config()
@@ -213,7 +77,7 @@ def __getattr__(name: str):
if name == 'SAVE_DENYLIST_PTN':
cfg, _ = _get_config()
return cfg.SAVE_DENYLIST_PTNS
raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")

View File

@@ -111,6 +111,24 @@ def load_config_file() -> Optional[benedict]:
return None
class PluginConfigSection:
"""Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf"""
toml_section_header = "PLUGINS"
def __init__(self, key: str):
self._key = key
def __getattr__(self, name: str) -> Any:
# Allow hasattr checks to pass for the key
if name == self._key:
return None
raise AttributeError(f"PluginConfigSection has no attribute '{name}'")
def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs):
"""No-op update since plugins read config dynamically via get_config()."""
pass
def section_for_key(key: str) -> Any:
"""Find the config section containing a given key."""
from archivebox.config.common import (
@@ -121,11 +139,22 @@ def section_for_key(key: str) -> Any:
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
# First check core config sections
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
if hasattr(section, key):
return section
# Check if this is a plugin config key
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
for plugin_name, schema in plugin_configs.items():
if 'properties' in schema and key in schema['properties']:
# All plugin config goes to [PLUGINS] section
return PluginConfigSection(key)
raise ValueError(f'No config section found for key: {key}')

View File

@@ -123,9 +123,7 @@ class ArchivingConfig(BaseConfigSet):
OVERWRITE: bool = Field(default=False)
TIMEOUT: int = Field(default=60)
MEDIA_TIMEOUT: int = Field(default=3600)
MEDIA_MAX_SIZE: str = Field(default="750m")
RESOLUTION: str = Field(default="1440,2000")
CHECK_SSL_VALIDITY: bool = Field(default=True)
USER_AGENT: str = Field(
@@ -141,15 +139,6 @@ class ArchivingConfig(BaseConfigSet):
DEFAULT_PERSONA: str = Field(default="Default")
# GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
# WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
# CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
# CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
# CHROME_USER_DATA_DIR: str | None = Field(default=None)
# CHROME_TIMEOUT: int = Field(default=0)
# CHROME_HEADLESS: bool = Field(default=True)
# CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
def validate(self):
if int(self.TIMEOUT) < 5:
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
@@ -215,7 +204,6 @@ class SearchBackendConfig(BaseConfigSet):
SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
SEARCH_PROCESS_HTML: bool = Field(default=True)
SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
SEARCH_BACKEND_CONFIG = SearchBackendConfig()

View File

@@ -174,7 +174,7 @@ def get_config(
config.update(dict(ARCHIVING_CONFIG))
config.update(dict(SEARCH_BACKEND_CONFIG))
# Load from config file
# Load from archivebox.config.file
config_file = CONSTANTS.CONFIG_FILE
if config_file.exists():
file_config = BaseConfigSet.load_from_file(config_file)

View File

@@ -17,7 +17,7 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date
from machine.models import Binary
from archivebox.machine.models import Binary
# Common binaries to check for

View File

@@ -4,7 +4,7 @@ __order__ = 100
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin as do_register
from archivebox.core.admin import register_admin as do_register
do_register(admin_site)

View File

@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
from django.contrib.auth import get_user_model
from core.models import Snapshot, ArchiveResult, Tag
from core.admin_tags import TagAdmin
from core.admin_snapshots import SnapshotAdmin
from core.admin_archiveresults import ArchiveResultAdmin
from core.admin_users import UserAdmin
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.core.admin_tags import TagAdmin
from archivebox.core.admin_snapshots import SnapshotAdmin
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
from archivebox.core.admin_users import UserAdmin
def register_admin(admin_site):

View File

@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from core.models import ArchiveResult, Snapshot
from archivebox.core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'].initial = '["-"]'
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')

View File

@@ -38,11 +38,11 @@ def register_admin_site():
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
from archivebox.core.admin import register_admin as register_core_admin
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)

View File

@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag, Snapshot
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.models import Tag, Snapshot
from archivebox.core.admin_tags import TagInline
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
fieldsets = (
('URL', {
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'fields': ('crawl', 'tags_str'),
'classes': ('card',),
}),
('Config', {

View File

@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from core.models import Tag
from archivebox.core.models import Tag
class TagInline(admin.TabularInline):

View File

@@ -4,9 +4,9 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
name = 'archivebox.core'
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
from archivebox.core.admin_site import register_admin_site
register_admin_site()

View File

@@ -20,7 +20,7 @@ application = get_asgi_application()
# from channels.routing import ProtocolTypeRouter, URLRouter
# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
# from core.routing import websocket_urlpatterns
# from archivebox.core.routing import websocket_urlpatterns
#
# application = ProtocolTypeRouter({
# "http": get_asgi_application(),

View File

@@ -4,10 +4,14 @@ from django import forms
from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
('1', 'depth = 1 (+ URLs one hop away)'),
('2', 'depth = 2 (+ URLs two hops away)'),
('3', 'depth = 3 (+ URLs three hops away)'),
('4', 'depth = 4 (+ URLs four hops away)'),
)
from archivebox.hooks import get_plugins
@@ -18,39 +22,180 @@ def get_plugin_choices():
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
plugins = forms.MultipleChoiceField(
label="Plugins (select at least 1, otherwise all will be used by default)",
# Basic fields
url = forms.RegexField(
label="URLs (one per line)",
regex=URL_REGEX,
min_length='6',
strip=True,
widget=forms.Textarea,
required=True
)
tag = forms.CharField(
label="Tags (comma separated tag1,tag2,tag3)",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'list': 'tag-datalist',
'autocomplete': 'off',
})
)
depth = forms.ChoiceField(
label="Archive depth",
choices=DEPTH_CHOICES,
initial='0',
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
)
notes = forms.CharField(
label="Notes",
strip=True,
required=False,
widget=forms.Textarea(attrs={
'rows': 3,
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
})
)
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
label="Chrome-dependent plugins",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[], # populated in __init__
)
archiving_plugins = forms.MultipleChoiceField(
label="Archiving",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
parsing_plugins = forms.MultipleChoiceField(
label="Parsing",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
search_plugins = forms.MultipleChoiceField(
label="Search",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
binary_plugins = forms.MultipleChoiceField(
label="Binary providers",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
extension_plugins = forms.MultipleChoiceField(
label="Browser extensions",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
# Advanced options
schedule = forms.CharField(
label="Repeat schedule",
max_length=64,
required=False,
widget=forms.TextInput(attrs={
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
persona = forms.CharField(
label="Persona (authentication profile)",
max_length=100,
initial='Default',
required=False,
)
overwrite = forms.BooleanField(
label="Overwrite existing snapshots",
initial=False,
required=False,
)
update = forms.BooleanField(
label="Update/retry previously failed URLs",
initial=False,
required=False,
)
index_only = forms.BooleanField(
label="Index only (don't archive yet)",
initial=False,
required=False,
)
config = forms.JSONField(
label="Custom config overrides",
widget=KeyValueWidget(),
initial=dict,
required=False,
widget=forms.SelectMultiple,
choices=[], # populated dynamically in __init__
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.fields['plugins'].choices = get_plugin_choices()
# TODO: hook these up to the view and put them
# in a collapsible UI section labeled "Advanced"
#
# exclude_patterns = forms.CharField(
# label="Exclude patterns",
# min_length='1',
# required=False,
# initial=URL_DENYLIST,
# )
# timeout = forms.IntegerField(
# initial=TIMEOUT,
# )
# overwrite = forms.BooleanField(
# label="Overwrite any existing Snapshots",
# initial=False,
# )
# index_only = forms.BooleanField(
# label="Add URLs to index without Snapshotting",
# initial=False,
# )
# Import at runtime to avoid circular imports
from archivebox.config.common import ARCHIVING_CONFIG
# Get all plugins
all_plugins = get_plugins()
# Define plugin groups
chrome_dependent = {
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
}
archiving = {
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
}
parsing = {
'parse_html_urls', 'parse_jsonl_urls',
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
}
search = {
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
]
self.fields['archiving_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in archiving
]
self.fields['parsing_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in parsing
]
self.fields['search_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in search
]
self.fields['binary_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in binary
]
self.fields['extension_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in extensions
]
# Set update default from config
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
def clean(self):
    """Merge the per-category plugin selections into ``cleaned_data['plugins']``.

    The form exposes six separate MultipleChoiceFields (chrome, archiving,
    parsing, search, binary, extensions); downstream code only wants one
    flat list, so we flatten them here.
    """
    cleaned_data = super().clean()
    plugin_fields = (
        'chrome_plugins', 'archiving_plugins', 'parsing_plugins',
        'search_plugins', 'binary_plugins', 'extension_plugins',
    )
    # Flatten every selected group (missing/invalid groups contribute nothing).
    cleaned_data['plugins'] = [
        plugin
        for field_name in plugin_fields
        for plugin in cleaned_data.get(field_name, [])
    ]
    return cleaned_data
class TagWidgetMixin:
def format_value(self, value):

View File

@@ -12,7 +12,7 @@ try:
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
except ImportError:
try:
from config import CONFIG
from archivebox.config import CONFIG
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
except ImportError:
ARCHIVE_DIR = Path('./archive')

View File

@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0031_snapshot_parent_snapshot'),
('crawls', '0004_alter_crawl_output_dir'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]

View File

@@ -0,0 +1,79 @@
# Generated migration
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
    """
    Create one catchall Crawl per user for all snapshots without a crawl.
    Assign those snapshots to their user's catchall crawl.

    Uses queryset ``.update()`` to bulk-assign each user's snapshots in a
    single query instead of a per-row ``save()`` loop, which would issue
    O(n) UPDATE statements on large archives.
    """
    Snapshot = apps.get_model('core', 'Snapshot')
    Crawl = apps.get_model('crawls', 'Crawl')
    User = apps.get_model(settings.AUTH_USER_MODEL)

    # Get all snapshots without a crawl
    snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
    if not snapshots_without_crawl.exists():
        return

    # One catchall crawl per distinct owner of an orphaned snapshot.
    # Materialize the id list up front so later .update() calls can't
    # interfere with the iteration.
    user_ids = list(
        snapshots_without_crawl.values_list('created_by_id', flat=True).distinct()
    )

    for user_id in user_ids:
        try:
            username = User.objects.get(pk=user_id).username
        except User.DoesNotExist:
            username = 'unknown'

        user_snapshots = snapshots_without_crawl.filter(created_by_id=user_id)
        count = user_snapshots.count()

        # Create catchall crawl for this user
        crawl = Crawl.objects.create(
            urls=f'# Catchall crawl for {count} snapshots without a crawl',
            max_depth=0,
            label=f'[migration] catchall for user {username}',
            created_by_id=user_id,
        )

        # Bulk-assign all of this user's orphaned snapshots to the crawl
        user_snapshots.update(crawl=crawl)
class Migration(migrations.Migration):
    """Make Snapshot.crawl required and drop Snapshot.created_by.

    Ownership moves from Snapshot.created_by to snapshot.crawl.created_by,
    so every orphaned snapshot is first attached to a per-user catchall
    crawl before the FK is made non-nullable.
    """

    dependencies = [
        ('core', '0034_snapshot_current_step'),
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    operations = [
        # Step 1: Assign all snapshots without a crawl to catchall crawls
        # (data migration; irreversible in practice, so reverse is a no-op)
        migrations.RunPython(
            create_catchall_crawls_and_assign_snapshots,
            reverse_code=migrations.RunPython.noop,
        ),
        # Step 2: Make crawl non-nullable
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        # Step 3: Remove created_by field
        migrations.RemoveField(
            model_name='snapshot',
            name='created_by',
        ),
    ]

View File

@@ -0,0 +1,19 @@
# Generated migration
from django.db import migrations
class Migration(migrations.Migration):
    """Drop ArchiveResult.created_by.

    Pure schema change: the owner is reachable via
    ``archiveresult.snapshot.crawl.created_by``, so no data migration
    is required before removing the column.
    """

    dependencies = [
        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
    ]

    operations = [
        # Remove created_by field from ArchiveResult
        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
        migrations.RemoveField(
            model_name='archiveresult',
            name='created_by',
        ),
    ]

View File

@@ -9,6 +9,8 @@ import os
import json
from pathlib import Path
from statemachine import State, registry
from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.utils.functional import cached_property
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
get_or_create_system_user_pk,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface, Binary
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
from archivebox.machine.models import NetworkInterface, Binary
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
class Meta:
app_label = 'core'
db_table = 'core_snapshot_tags'
unique_together = [('snapshot', 'tag')]
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
# Import Methods
# =========================================================================
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
import re
from archivebox.config.common import GENERAL_CONFIG
url = link_dict['url']
timestamp = link_dict.get('timestamp')
title = link_dict.get('title')
tags_str = link_dict.get('tags')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = self.create(
url=url,
timestamp=timestamp,
title=title,
created_by_id=created_by_id or get_or_create_system_user_pk(),
)
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
return snapshot
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
def remove(self, atomic: bool = False) -> tuple:
"""Remove snapshots from the database"""
from django.db import transaction
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_name = 'core.models.SnapshotMachine'
state_field_name = 'status'
retry_at_field_name = 'retry_at'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
archiveresult_set: models.Manager['ArchiveResult']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
@property
def created_by(self):
    """Convenience property to access the user who created this snapshot via its crawl.

    The denormalized Snapshot.created_by column was removed (migration
    0035); the crawl is now the single source of truth for ownership.
    Requires ``self.crawl`` to be set (the FK is non-nullable).
    """
    return self.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.fs_version = target
super().save(*args, **kwargs)
if self.crawl and self.url not in self.crawl.urls:
if self.url not in self.crawl.urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'crawl_id': str(self.crawl_id),
'depth': self.depth,
'status': self.status,
},
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return self.fs_version != self._fs_current_version()
def _fs_next_version(self, version: str) -> str:
"""Get next version in migration chain"""
chain = ['0.7.0', '0.8.0', '0.9.0']
try:
idx = chain.index(version)
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
except ValueError:
# Unknown version - skip to current
return self._fs_current_version()
def _fs_migrate_from_0_7_0_to_0_8_0(self):
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
# 0.7 and 0.8 both used archive/<timestamp>
# Nothing to do!
pass
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
if version in ('0.7.0', '0.8.0'):
return '0.9.0'
return self._fs_current_version()
def _fs_migrate_from_0_8_0_to_0_9_0(self):
"""
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return CONSTANTS.ARCHIVE_DIR / self.timestamp
elif version in ('0.9.0', '1.0.0'):
username = self.created_by.username if self.created_by else 'unknown'
username = self.created_by.username
# Use created_at for date grouping (fallback to timestamp)
if self.created_at:
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
pwd=result_data.get('pwd', str(self.output_dir)),
start_ts=start_ts,
end_ts=end_ts,
created_by=self.created_by,
)
except:
pass
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result = archive_results.get(plugin)
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_plugin_icon(plugin)
# Skip plugins with empty icons that have no output
# (e.g., staticfile only shows when there's actual output)
if not icon.strip() and not existing:
continue
output += format_html(
output_template,
path,
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def run(self) -> list['ArchiveResult']:
"""
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
Called by the state machine when entering the 'started' state.
Called by: SnapshotMachine.enter_started()
Hook Lifecycle:
1. discover_hooks('Snapshot') → finds all plugin hooks
2. For each hook:
- Create ArchiveResult with status=QUEUED
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
3. ArchiveResults execute independently via ArchiveResultMachine
4. Hook execution happens in ArchiveResult.run(), NOT here
Returns:
list[ArchiveResult]: Newly created pending results
"""
return self.create_pending_archiveresults()
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Called by the state machine when entering the 'sealed' state.
Kills any background hooks and finalizes their ArchiveResults.
"""
from pathlib import Path
from archivebox.hooks import kill_process
# Kill any background ArchiveResult hooks
if not self.OUTPUT_DIR.exists():
return
for plugin_dir in self.OUTPUT_DIR.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
if pid_file.exists():
kill_process(pid_file, validate=True) # Use validation
# Find all .pid files in this snapshot's output directory
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
kill_process(pid_file, validate=True)
# Update the ArchiveResult from filesystem
plugin_name = plugin_dir.name
results = self.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED,
pwd__contains=plugin_name
)
for ar in results:
ar.update_from_output()
# Update all STARTED ArchiveResults from filesystem
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
for ar in results:
ar.update_from_output()
def has_running_background_hooks(self) -> bool:
"""
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSONL record.
Create/update Snapshot from JSONL record or dict.
Unified method that handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
- Optionally queues for extraction
Args:
record: JSONL record with 'url' field and optional metadata
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
Returns:
Snapshot instance or None
Note:
Filtering (depth, URL allowlist/denylist) should be done by caller
BEFORE calling this method. This method just creates the snapshot.
"""
from archivebox.misc.jsonl import get_or_create_snapshot
import re
from django.utils import timezone
from archivebox.misc.util import parse_date
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.config.common import GENERAL_CONFIG
overrides = overrides or {}
# If 'id' is provided, lookup and patch that specific snapshot
snapshot_id = record.get('id')
if snapshot_id:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
# Generically update all fields present in record
update_fields = []
for field_name, value in record.items():
# Skip internal fields
if field_name in ('id', 'type'):
continue
# Skip if field doesn't exist on model
if not hasattr(snapshot, field_name):
continue
# Special parsing for date fields
if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
if value and isinstance(value, str):
value = parse_date(value)
# Update field if value is provided and different
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
except Snapshot.DoesNotExist:
# ID not found, fall through to create-by-URL logic
pass
url = record.get('url')
if not url:
return None
# Apply crawl context metadata
# Determine or create crawl (every snapshot must have a crawl)
crawl = overrides.get('crawl')
snapshot = overrides.get('snapshot') # Parent snapshot
parent_snapshot = overrides.get('snapshot') # Parent snapshot
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
if crawl:
record.setdefault('crawl_id', str(crawl.id))
record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
if snapshot:
record.setdefault('parent_snapshot_id', str(snapshot.id))
# If no crawl provided, inherit from parent or auto-create one
if not crawl:
if parent_snapshot:
# Inherit crawl from parent snapshot
crawl = parent_snapshot.crawl
else:
# Auto-create a single-URL crawl
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
try:
created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(url)
# Queue for extraction
new_snapshot.status = Snapshot.StatusChoices.QUEUED
new_snapshot.retry_at = timezone.now()
new_snapshot.save()
crawl = Crawl.objects.create(
urls=url,
max_depth=0,
label=f'auto-created for {url[:50]}',
created_by_id=created_by_id,
)
return new_snapshot
except ValueError:
return None
# Parse tags
tags_str = record.get('tags', '')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
title = record.get('title')
timestamp = record.get('timestamp')
if snapshot:
# Update existing snapshot
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
# Create new snapshot
if timestamp:
while Snapshot.objects.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = Snapshot.objects.create(
url=url,
timestamp=timestamp,
title=title,
crawl=crawl,
)
# Update tags
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
# Queue for extraction and update additional fields
update_fields = []
if queue_for_extraction:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
update_fields.extend(['status', 'retry_at'])
# Update additional fields if provided
for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
value = record.get(field_name)
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
def create_pending_archiveresults(self) -> list['ArchiveResult']:
"""
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
    """
    Report whether this snapshot has completed all of its processing.

    Used by SnapshotMachine.is_finished() to decide when to seal the snapshot.

    Returns:
        True when archiveresults exist and none are still pending, False otherwise.
    """
    has_results = self.archiveresult_set.exists()
    if not has_results:
        # No archiveresults have been created yet, so there is nothing "finished" about it.
        return False

    # Advance through as many hook steps as are currently ready; each successful call
    # bumps current_step once all foreground hooks in the active step have completed.
    while self.advance_step_if_ready():
        continue

    # Background hooks left in STARTED are intentionally NOT waited on here:
    # pending_archiveresults() excludes FINAL_OR_ACTIVE_STATES (STARTED is active),
    # and any stragglers are killed by cleanup() when the snapshot becomes sealed.
    return not self.pending_archiveresults().exists()
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
# =============================================================================
# Snapshot State Machine
# =============================================================================
class SnapshotMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing Snapshot lifecycle.

    Hook Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for snapshot to be ready                         │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. snapshot.run()                                          │
    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
    │     • create_pending_archiveresults() → creates ONE         │
    │       ArchiveResult per hook (NO execution yet)             │
    │  2. ArchiveResults process independently with their own     │
    │     state machines (see ArchiveResultMachine)               │
    │  3. Advance through steps 0-9 as foreground hooks complete  │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when is_finished()
    ┌─────────────────────────────────────────────────────────────┐
    │ SEALED State → enter_sealed()                               │
    │  • cleanup() → kills any background hooks still running     │
    │  • Set retry_at=None (no more processing)                   │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name BaseStateMachine uses to bind the model instance to this machine
    model_attr_name = 'snapshot'

    # States (values mirror Snapshot.StatusChoices so the DB status field doubles as the machine state)
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event
    # NOTE: candidate transitions are evaluated in declaration order on each tick()
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        """A snapshot is startable as soon as it has a non-empty URL."""
        can_start = bool(self.snapshot.url)
        return can_start

    def is_finished(self) -> bool:
        """Check if snapshot processing is complete - delegates to model method."""
        return self.snapshot.is_finished_processing()

    @queued.enter
    def enter_queued(self):
        # Mark as queued and make it immediately eligible for worker pickup
        self.snapshot.update_and_requeue(
            retry_at=timezone.now(),
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        # lock the snapshot while we create the pending archiveresults
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
        )
        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()
        # unlock the snapshot after we're done + set status = started
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self):
        # Clean up background hooks
        self.snapshot.cleanup()
        # Final state: retry_at=None means workers never pick this snapshot up again
        self.snapshot.update_and_requeue(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Binary FK (optional - set when hook reports cmd)
binary = models.ForeignKey(
'machine.Binary',
Binary,
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='archiveresults',
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_name = 'core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
def __str__(self):
    # e.g. '[<id>] https://example.com/page... -> wget' (URL truncated to 64 chars)
    return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
@property
def created_by(self):
    """Convenience property to access the user who created this archive result via its snapshot's crawl."""
    # NOTE(review): assumes snapshot.crawl is always set; would raise AttributeError
    # for a crawl-less snapshot — confirm crawl is non-nullable for ArchiveResults
    return self.snapshot.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
    # Intentional no-op — presumably stubs out a base-class indexing step; TODO confirm
    pass
def cascade_health_update(self, success: bool):
    """Propagate one success/failure tally up the ownership chain.

    Increments health stats on this ArchiveResult, its parent Snapshot,
    and the grandparent Crawl, in that order.
    """
    for record in (self, self.snapshot, self.snapshot.crawl):
        record.increment_health_stats(success)
def run(self):
"""
Execute this ArchiveResult's hook and update status.
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
from archivebox.config.configset import get_config
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Get merged config with proper context
config = get_config(
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Determine which hook(s) to run
hooks = []
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
result = run_hook(
hook,
output_dir=plugin_dir,
config_objects=config_objects,
config=config,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
crawl_id=str(self.snapshot.crawl.id),
depth=self.snapshot.depth,
)
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Filter Snapshot records for depth/URL constraints
if record_type == 'Snapshot':
if not self.snapshot.crawl:
continue
url = record.get('url')
if not url:
continue
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
overrides = {
'snapshot': self.snapshot,
'crawl': self.snapshot.crawl,
'created_by_id': self.snapshot.created_by_id,
'created_by_id': self.created_by.pk,
}
process_hook_records(filtered_records, overrides=overrides)
# Update snapshot title if this is the title plugin
plugin_name = get_plugin_name(self.plugin)
if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
self._update_snapshot_title(plugin_dir)
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
# Cleanup PID files and empty logs
pid_file = plugin_dir / 'hook.pid'
pid_file.unlink(missing_ok=True)
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not cmd:
return
from machine.models import Machine
from archivebox.machine.models import Machine
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
machine = Machine.current()
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if binary:
self.binary = binary
def _update_snapshot_title(self, plugin_dir: Path):
"""
Update snapshot title from title plugin output.
The title plugin writes title.txt with the extracted page title.
This updates the Snapshot.title field if the file exists and has content.
"""
title_file = plugin_dir / 'title.txt'
if title_file.exists():
try:
title = title_file.read_text(encoding='utf-8').strip()
if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
self.snapshot.title = title[:512] # Max length from model
self.snapshot.save(update_fields=['title', 'modified_at'])
except Exception:
pass # Failed to read title, that's okay
def _url_passes_filters(self, url: str) -> bool:
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Get merged config with proper hierarchy
config = get_config(
user=self.snapshot.created_by if self.snapshot else None,
crawl=self.snapshot.crawl if self.snapshot else None,
user=self.created_by,
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return False # No allowlist patterns matched
return True # No filters or passed filters
def trigger_search_indexing(self):
    """Run any ArchiveResult__index hooks to update search indexes."""
    from archivebox.hooks import discover_hooks, run_hook

    # Pass config objects in priority order (later overrides earlier);
    # omit the crawl when this snapshot does not belong to one
    config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]

    for hook in discover_hooks('ArchiveResult__index'):
        # Each index hook runs in this result's output dir with full snapshot context
        run_hook(
            hook,
            output_dir=self.output_dir,
            config_objects=config_objects,
            url=self.snapshot.url,
            snapshot_id=str(self.snapshot.id),
            plugin=self.plugin,
        )
@property
def output_dir(self) -> Path:
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
return pid_file.exists()
return pid_file.exists()
# =============================================================================
# ArchiveResult State Machine
# =============================================================================
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing ArchiveResult (single plugin execution) lifecycle.

    Hook Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for its turn to run                              │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. archiveresult.run()                                     │
    │     • Find specific hook by hook_name                       │
    │     • run_hook(script, output_dir, ...) → subprocess        │
    │                                                             │
    │  2a. FOREGROUND hook (returns HookResult):                  │
    │      • update_from_output() immediately                     │
    │        - Read stdout.log                                    │
    │        - Parse JSONL records                                │
    │        - Extract 'ArchiveResult' record → update status     │
    │        - Walk output_dir → populate output_files            │
    │        - Call process_hook_records() for side effects       │
    │                                                             │
    │  2b. BACKGROUND hook (returns None):                        │
    │      • Status stays STARTED                                 │
    │      • Continues running in background                      │
    │      • Killed by Snapshot.cleanup() when sealed             │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
    │  • Set by hook's JSONL output during update_from_output()   │
    │  • Health stats incremented (num_uses_succeeded/failed)     │
    │  • Parent Snapshot health stats also updated                │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name BaseStateMachine uses to bind the model instance to this machine
    model_attr_name = 'archiveresult'

    # States (values mirror ArchiveResult.StatusChoices so the DB status field doubles as the machine state)
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    # NOTE: candidate transitions are evaluated in declaration order on each tick()
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(skipped, cond='is_skipped') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start') |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed') |
        backoff.to(skipped, cond='is_skipped')
    )

    def can_start(self) -> bool:
        """A result can start as soon as its parent snapshot has a non-empty URL."""
        can_start = bool(self.archiveresult.snapshot.url)
        return can_start

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
            not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """Check if extraction has completed (success, failure, or skipped)."""
        return self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        )

    @queued.enter
    def enter_queued(self):
        # Reset to queued with retry_at=now so a worker picks it up immediately
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )

    @started.enter
    def enter_started(self):
        from archivebox.machine.models import NetworkInterface
        # Lock the object and mark start time
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
            iface=NetworkInterface.current(),
        )
        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()
        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        # Plugin did not complete: wait 60s before the next attempt
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
        )

    @succeeded.enter
    def enter_succeeded(self):
        # Final state: retry_at=None means workers never pick this result up again
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
        )
        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=True)

    @failed.enter
    def enter_failed(self):
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )
        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=False)

    @skipped.enter
    def enter_skipped(self):
        # Skipped results are final but do not affect health stats
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

    def after_transition(self, event: str, source: State, target: State):
        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
# =============================================================================
# State Machine Registration
# =============================================================================
# Manually register state machines with the python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)

2638
archivebox/core/models.py.bak Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
### Django Core Settings
################################################################################
WSGI_APPLICATION = "core.wsgi.application"
ASGI_APPLICATION = "core.asgi.application"
ROOT_URLCONF = "core.urls"
WSGI_APPLICATION = "archivebox.core.wsgi.application"
ASGI_APPLICATION = "archivebox.core.asgi.application"
ROOT_URLCONF = "archivebox.core.urls"
LOGIN_URL = "/accounts/login/"
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# Our ArchiveBox-provided apps (use fully qualified names)
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
"archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors)
"archivebox.personas", # handles Persona and session management
"archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
"archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core)
"archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
# Use hooks.py discover_hooks() for plugin functionality
# 3rd-party apps from PyPI that need to be loaded last
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
MIDDLEWARE = [
"core.middleware.TimezoneMiddleware",
"archivebox.core.middleware.TimezoneMiddleware",
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"core.middleware.CacheControlMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
]
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
################################################################################
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ...,
"core.models.ArchiveResult": ...,
"core.models.Tag": ...,
"api.models.APIToken": ...,
"archivebox.core.models.Snapshot": ...,
"archivebox.core.models.ArchiveResult": ...,
"archivebox.core.models.Tag": ...,
"archivebox.api.models.APIToken": ...,
},
}
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
"URLS": [
{
"route": "config/",
"view": "core.views.live_config_list_view",
"view": "archivebox.core.views.live_config_list_view",
"name": "Configuration",
"items": {
"route": "<str:key>/",
"view": "core.views.live_config_value_view",
"view": "archivebox.core.views.live_config_value_view",
"name": "config_val",
},
},

View File

@@ -1,319 +0,0 @@
__package__ = 'archivebox.core'
import time
import os
from datetime import timedelta
from typing import ClassVar
from django.db.models import F
from django.utils import timezone
from rich import print
from statemachine import State, StateMachine
# from workers.actor import ActorType
from core.models import Snapshot, ArchiveResult
from crawls.models import Crawl
class SnapshotMachine(StateMachine, strict_states=True):
    """
    State machine for managing Snapshot lifecycle.

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Class-level annotation documenting which model this machine wraps
    model: Snapshot

    # States (values mirror Snapshot.StatusChoices so the DB status field doubles as the machine state)
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event
    # NOTE: candidate transitions are evaluated in declaration order on each tick()
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, snapshot, *args, **kwargs):
        # Keep a direct reference to the model for use in the handlers below
        self.snapshot = snapshot
        super().__init__(snapshot, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Snapshot[{self.snapshot.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """A snapshot is startable as soon as it has a non-empty URL."""
        can_start = bool(self.snapshot.url)
        # Suppressed: queue waiting logs
        return can_start

    def is_finished(self) -> bool:
        """Return True once all archiveresults exist and none are still pending."""
        # if no archiveresults exist yet, it's not finished
        if not self.snapshot.archiveresult_set.exists():
            return False
        # Try to advance step if ready (handles step-based hook execution)
        # This will increment current_step when all foreground hooks in current step are done
        while self.snapshot.advance_step_if_ready():
            pass  # Keep advancing until we can't anymore
        # if archiveresults exist but are still pending, it's not finished
        if self.snapshot.pending_archiveresults().exists():
            return False
        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
        # Background hooks in STARTED state are excluded by pending_archiveresults()
        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
        # we can transition to sealed and cleanup() will kill the background hooks
        # otherwise archiveresults exist and are all finished, so it's finished
        return True

    # def on_transition(self, event, state):
    #     print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')

    @queued.enter
    def enter_queued(self):
        # Suppressed: state transition logs
        self.snapshot.update_for_workers(
            retry_at=timezone.now(),
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        # Suppressed: state transition logs
        # lock the snapshot while we create the pending archiveresults
        self.snapshot.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
        )
        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()
        # unlock the snapshot after we're done + set status = started
        self.snapshot.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self):
        # Clean up background hooks
        self.snapshot.cleanup()
        # Suppressed: state transition logs
        # Final state: retry_at=None means workers never pick this snapshot up again
        self.snapshot.update_for_workers(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
# class SnapshotWorker(ActorType[Snapshot]):
# """
# The primary actor for progressing Snapshot objects
# through their lifecycle using the SnapshotMachine.
# """
# Model = Snapshot
# StateMachineClass = SnapshotMachine
# ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
# MAX_TICK_TIME: ClassVar[int] = 10
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultMachine(StateMachine, strict_states=True):
"""
State machine for managing ArchiveResult lifecycle.
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model: ArchiveResult
# States
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
started = State(value=ArchiveResult.StatusChoices.STARTED)
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
)
def __init__(self, archiveresult, *args, **kwargs):
self.archiveresult = archiveresult
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if extractor plugin failed (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extractor plugin was skipped (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
not self.archiveresult.output_str
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
@queued.enter
def enter_queued(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
from machine.models import NetworkInterface
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
iface=NetworkInterface.current(),
)
# Run the plugin - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Suppressed: plugin result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
end_ts=None,
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save()
@succeeded.enter
def enter_succeeded(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
self.archiveresult.save()
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
@failed.enter
def enter_failed(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
end_ts=timezone.now(),
)
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
@skipped.enter
def enter_skipped(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):
    """After every transition, nudge the parent Snapshot's retry_at so its
    worker promptly notices this ArchiveResult's new state."""
    self.archiveresult.snapshot.update_for_workers()
# class ArchiveResultWorker(ActorType[ArchiveResult]):
# """
# The primary actor for progressing ArchiveResult objects
# through their lifecycle using the ArchiveResultMachine.
# """
# Model = ArchiveResult
# StateMachineClass = ArchiveResultMachine
# ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
# MAX_TICK_TIME: ClassVar[int] = 60
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

View File

@@ -0,0 +1,20 @@
"""Template tags for accessing config values in templates."""
from django import template
from archivebox.config.configset import get_config as _get_config
register = template.Library()
@register.simple_tag
def get_config(key: str) -> any:
"""
Get a config value by key.
Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
"""
try:
return _get_config(key)
except (KeyError, AttributeError):
return None

View File

@@ -1,3 +1,319 @@
"""Tests for the core views, especially AddView."""
import os

import django

# Django must be configured before importing any Django-dependent modules.
# Use the same settings module that archivebox/__init__.py sets
# ('archivebox.core.settings' — the bare 'archivebox.settings' module does not exist).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
django.setup()

from django.test import TestCase, Client
from django.contrib.auth.models import User
from django.urls import reverse

from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Tag
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)
        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')
        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: Django test client can't easily POST the KeyValueWidget format,
        # so this test would need to use the form directly or mock the cleaned_data
        self.skipTest("KeyValueWidget POST format is not expressible via the test client yet")

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # Django 4.1+ signature: pass the bound form instance, not the response
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
from archivebox.workers.views import JobsDashboardView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

View File

@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -150,7 +150,6 @@ class SnapshotView(View):
'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, **kwargs):
from archivebox.core.models import Tag
return {
**super().get_context_data(**kwargs),
'title': "Add URLs",
'title': "Create Crawl",
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'stdout': '',
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
plugins = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": urls,
"tag": tag,
"depth": depth,
"parser": parser,
"update_all": False,
"out_dir": DATA_DIR,
"created_by_id": self.request.user.pk,
}
if plugins:
input_kwargs.update({"plugins": plugins})
# Extract all form fields
tag = form.cleaned_data["tag"]
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona", "Default")
overwrite = form.cleaned_data.get("overwrite", False)
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
from archivebox.config.permissions import HOSTNAME
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
# 2. create a new Crawl with the URLs from the file
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
urls_content = sources_file.read_text()
# Build complete config
config = {
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
'DEFAULT_PERSONA': persona or 'Default',
}
# Merge custom config overrides
config.update(custom_config)
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
config={
# 'ONLY_NEW': not update,
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'PLUGINS': plugins or '',
# 'DEFAULT_PERSONA': persona or 'Default',
}
config=config
)
# 3. create a CrawlSchedule if schedule is provided
if schedule:
from crawls.models import CrawlSchedule
crawl_schedule = CrawlSchedule.objects.create(
template=crawl,
schedule=schedule,
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
rough_url_count = urls.count('://')
# Build success message with schedule link if created
schedule_msg = ""
if schedule:
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
messages.success(
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
)
# Orchestrator (managed by supervisord) will pick up the queued crawl
@@ -516,8 +540,8 @@ def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
import os
from machine.models import Machine
from archivebox.machine.models import Machine
# Check if it's from machine config
# Check if it's from machine config
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
if key in os.environ:
return 'Environment'
# Check if it's from config file
# Check if it's from config file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get merged config that includes Machine.config overrides
try:
from machine.models import Machine
from archivebox.machine.models import Machine
machine = Machine.current()
merged_config = get_config()
except Exception as e:
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import os
from machine.models import Machine
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
CONFIGS = get_all_configs()

View File

@@ -17,8 +17,8 @@ from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from core.models import Snapshot
from crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):

View File

@@ -3,4 +3,4 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "crawls"
name = "archivebox.crawls"

View File

@@ -1,6 +1,7 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING, Iterable
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
@@ -11,13 +12,15 @@ from django.conf import settings
from django.urls import reverse_lazy
from django.utils import timezone
from django_stubs_ext.db.models import TypedModelMeta
from statemachine import State, registry
from rich import print
from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from workers.models import ModelWithStateMachine
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
if TYPE_CHECKING:
from core.models import Snapshot, ArchiveResult
from archivebox.core.models import Snapshot, ArchiveResult
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
@@ -35,6 +38,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
crawl_set: models.Manager['Crawl']
class Meta(TypedModelMeta):
app_label = 'crawls'
verbose_name = 'Scheduled Crawl'
verbose_name_plural = 'Scheduled Crawls'
@@ -73,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
state_machine_name = 'crawls.statemachines.CrawlMachine'
state_machine_name = 'crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -82,6 +86,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
app_label = 'crawls'
verbose_name = 'Crawl'
verbose_name_plural = 'Crawls'
@@ -168,7 +173,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return Path(path_str)
def create_root_snapshot(self) -> 'Snapshot':
from core.models import Snapshot
from archivebox.core.models import Snapshot
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
if not first_url:
@@ -245,7 +250,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
List of newly created Snapshot objects
"""
import json
from core.models import Snapshot
from archivebox.core.models import Snapshot
created_snapshots = []
@@ -309,9 +314,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
import time
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
from archivebox.config.configset import get_config
# Get merged config with crawl context
config = get_config(crawl=self)
# Discover and run on_Crawl hooks
hooks = discover_hooks('Crawl')
hooks = discover_hooks('Crawl', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
@@ -323,8 +332,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
result = run_hook(
hook,
output_dir=output_dir,
timeout=60,
config_objects=[self],
config=config,
crawl_id=str(self.id),
source_url=first_url,
)
@@ -380,7 +388,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
pass
# Run on_CrawlEnd hooks
hooks = discover_hooks('CrawlEnd')
from archivebox.config.configset import get_config
config = get_config(crawl=self)
hooks = discover_hooks('CrawlEnd', config=config)
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
for hook in hooks:
@@ -391,8 +402,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
result = run_hook(
hook,
output_dir=output_dir,
timeout=30,
config_objects=[self],
config=config,
crawl_id=str(self.id),
source_url=first_url,
)
@@ -400,3 +410,131 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Log failures but don't block
if result and result['returncode'] != 0:
print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]')
# =============================================================================
# State Machines
# =============================================================================
class CrawlMachine(BaseStateMachine, strict_states=True):
    """
    State machine driving a Crawl through queued -> started -> sealed.

    While QUEUED, ticks are no-ops until the crawl's `urls` field contains at
    least one valid URL. On entering STARTED, crawl.run() fires the on_Crawl
    hooks, processes their JSONL output into Snapshots, and creates the
    root/url Snapshots; those snapshots then progress independently via their
    own state machines (see SnapshotMachine). Once every child Snapshot has
    left the queued/started states, the crawl transitions to SEALED, where
    cleanup() runs the on_CrawlEnd hooks (and kills background hooks) and
    retry_at is cleared so workers stop picking the crawl up.
    """
    model_attr_name = 'crawl'

    # States
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        """Startable once the crawl's urls field parses to at least one URL."""
        if not self.crawl.urls:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
            return False
        if not self.crawl.get_urls_list():
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
            return False
        return True

    def is_finished(self) -> bool:
        """Finished once >=1 snapshot exists and none are still queued/started."""
        from archivebox.core.models import Snapshot

        children = Snapshot.objects.filter(crawl=self.crawl)
        if not children.exists():
            # Nothing created yet — keep waiting.
            return False
        # Snapshots handle their own background hooks via the step system,
        # so sealing only waits for them to reach a final state.
        still_active = children.filter(
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
        )
        return not still_active.exists()

    @started.enter
    def enter_started(self):
        """Run the crawl (hooks -> JSONL -> Snapshots) and move it to STARTED."""
        # Bump retry_at first so other workers don't grab this crawl mid-run.
        self.crawl.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=30),  # Lock for 30 seconds
        )
        try:
            self.crawl.run()
            # Snapshots exist now; poll again in 5s rather than busy-looping
            # while they are processed by their own workers.
            self.crawl.update_and_requeue(
                retry_at=timezone.now() + timedelta(seconds=5),
                status=Crawl.StatusChoices.STARTED,
            )
        except Exception as e:
            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
            import traceback
            traceback.print_exc()
            # Re-raise so the worker knows it failed
            raise

    def on_started_to_started(self):
        """Self-transition while child snapshots are unfinished: schedule next poll."""
        self.crawl.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=5),
        )

    @sealed.enter
    def enter_sealed(self):
        """Tear down background hooks, run on_CrawlEnd hooks, and park the crawl."""
        self.crawl.cleanup()
        self.crawl.update_and_requeue(
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )


# =============================================================================
# Register State Machines
# =============================================================================

# Manually register with the python-statemachine registry — normally machines
# are auto-discovered from statemachines.py, but this module defines them inline.
registry.register(CrawlMachine)

View File

@@ -1,114 +0,0 @@
__package__ = 'archivebox.crawls'
import os
from typing import ClassVar
from datetime import timedelta
from django.utils import timezone
from rich import print
from statemachine import State, StateMachine
# from workers.actor import ActorType
from crawls.models import Crawl
class CrawlMachine(StateMachine, strict_states=True):
"""State machine for managing Crawl lifecycle."""
model: Crawl
# States
queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
started = State(value=Crawl.StatusChoices.STARTED)
sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
def __init__(self, crawl, *args, **kwargs):
self.crawl = crawl
super().__init__(crawl, *args, **kwargs)
def __repr__(self) -> str:
return f'Crawl[{self.crawl.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
if not self.crawl.urls:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
return False
urls_list = self.crawl.get_urls_list()
if not urls_list:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
return False
return True
def is_finished(self) -> bool:
from core.models import Snapshot, ArchiveResult
# check that at least one snapshot exists for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
if not snapshots.exists():
return False
# check to make sure no snapshots are in non-final states
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
return False
# check that some archiveresults exist for this crawl
results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
if not results.exists():
return False
# check if all archiveresults are finished
if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
return False
return True
# def before_transition(self, event, state):
# print(f"Before '{event}', on the '{state.id}' state.")
# return "before_transition_return"
@started.enter
def enter_started(self):
# Suppressed: state transition logs
# Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds
)
try:
# Run the crawl - runs hooks, processes JSONL, creates snapshots
self.crawl.run()
# Update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now(), # Process immediately
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
import traceback
traceback.print_exc()
# Re-raise so the worker knows it failed
raise
@sealed.enter
def enter_sealed(self):
# Clean up background hooks and run on_CrawlEnd hooks
self.crawl.cleanup()
# Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,
)

View File

@@ -146,11 +146,16 @@ class HookResult(TypedDict, total=False):
records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
def discover_hooks(event_name: str) -> List[Path]:
def discover_hooks(
event_name: str,
filter_disabled: bool = True,
config: Optional[Dict[str, Any]] = None
) -> List[Path]:
"""
Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
Searches both built-in and user plugin directories.
Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
Returns scripts sorted alphabetically by filename for deterministic execution order.
Hook naming convention uses numeric prefixes to control order:
@@ -158,9 +163,29 @@ def discover_hooks(event_name: str) -> List[Path]:
on_Snapshot__15_singlefile.py # runs second
on_Snapshot__26_readability.py # runs later (depends on singlefile)
Example:
Args:
event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
filter_disabled: If True, skip hooks from disabled plugins (default: True)
config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
If None, will call get_config() with global scope
Returns:
Sorted list of hook script paths from enabled plugins only.
Examples:
# With proper config context (recommended):
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
discover_hooks('Snapshot', config=config)
# Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
# Without config (uses global defaults):
discover_hooks('Snapshot')
# Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
# Returns: [Path('.../on_Snapshot__10_title.py'), ...]
# Show all plugins regardless of enabled status:
discover_hooks('Snapshot', filter_disabled=False)
# Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
"""
hooks = []
@@ -177,45 +202,44 @@ def discover_hooks(event_name: str) -> List[Path]:
pattern_direct = f'on_{event_name}__*.{ext}'
hooks.extend(base_dir.glob(pattern_direct))
# Filter by enabled plugins
if filter_disabled:
# Get merged config if not provided (lazy import to avoid circular dependency)
if config is None:
from archivebox.config.configset import get_config
config = get_config(scope='global')
enabled_hooks = []
for hook in hooks:
# Get plugin name from parent directory
# e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
plugin_name = hook.parent.name
# Check if this is a plugin directory (not the root plugins dir)
if plugin_name in ('plugins', '.'):
# Hook is in root plugins directory, not a plugin subdir
# Include it by default (no filtering for non-plugin hooks)
enabled_hooks.append(hook)
continue
# Check if plugin is enabled
plugin_config = get_plugin_special_config(plugin_name, config)
if plugin_config['enabled']:
enabled_hooks.append(hook)
hooks = enabled_hooks
# Sort by filename (not full path) to ensure numeric prefix ordering works
# e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
return sorted(set(hooks), key=lambda p: p.name)
def discover_all_hooks() -> Dict[str, List[Path]]:
"""
Discover all hooks organized by event name.
Returns a dict mapping event names to lists of hook script paths.
"""
hooks_by_event: Dict[str, List[Path]] = {}
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
for ext in ('sh', 'py', 'js'):
for hook_path in base_dir.glob(f'*/on_*__*.{ext}'):
# Extract event name from filename: on_EventName__hook_name.ext
filename = hook_path.stem # on_EventName__hook_name
if filename.startswith('on_') and '__' in filename:
event_name = filename[3:].split('__')[0] # EventName
if event_name not in hooks_by_event:
hooks_by_event[event_name] = []
hooks_by_event[event_name].append(hook_path)
# Sort hooks within each event
for event_name in hooks_by_event:
hooks_by_event[event_name] = sorted(set(hooks_by_event[event_name]), key=lambda p: p.name)
return hooks_by_event
def run_hook(
script: Path,
output_dir: Path,
timeout: int = 300,
config_objects: Optional[List[Any]] = None,
config: Dict[str, Any],
timeout: Optional[int] = None,
**kwargs: Any
) -> HookResult:
"""
@@ -224,31 +248,33 @@ def run_hook(
This is the low-level hook executor. For running extractors with proper
metadata handling, use call_extractor() instead.
Config is passed to hooks via environment variables with this priority:
1. Plugin schema defaults (config.json)
2. Config file (ArchiveBox.conf)
3. Environment variables
4. Machine.config (auto-included, lowest override priority)
5. config_objects (in order - later objects override earlier ones)
Config is passed to hooks via environment variables. Caller MUST use
get_config() to merge all sources (file, env, machine, crawl, snapshot).
Args:
script: Path to the hook script (.sh, .py, or .js)
output_dir: Working directory for the script (where output files go)
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time in seconds
config_objects: Optional list of objects with .config JSON fields
(e.g., [crawl, snapshot] - later items have higher priority)
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
**kwargs: Arguments passed to the script as --key=value
Returns:
HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
"""
import time
start_time = time.time()
# Auto-include Machine.config at the start (lowest priority among config_objects)
from machine.models import Machine
machine = Machine.current()
all_config_objects = [machine] + list(config_objects or [])
# Auto-detect timeout from plugin config if not explicitly provided
if timeout is None:
plugin_name = script.parent.name
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
if not script.exists():
return HookResult(
@@ -302,51 +328,16 @@ def run_hook(
env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
# If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
for obj in all_config_objects:
if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'): # Duck-type check for Crawl
env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
break
# Build overrides from any objects with .config fields (in order, later overrides earlier)
# all_config_objects includes Machine at the start, then any passed config_objects
overrides = {}
for obj in all_config_objects:
if obj and hasattr(obj, 'config') and obj.config:
# Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
for key, value in obj.config.items():
clean_key = key.removeprefix('config/')
overrides[clean_key] = value
# Get plugin config from JSON schemas with hierarchy resolution
# This merges: schema defaults -> config file -> env vars -> object config overrides
plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
export_plugin_config_to_env(plugin_config, env)
# Also pass core config values that aren't in plugin schemas yet
# These are legacy values that may still be needed
from archivebox import config
env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
# Pass SEARCH_BACKEND_ENGINE from new-style config
try:
from archivebox.config.configset import get_config
search_config = get_config()
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
except Exception:
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
# Export all config values to environment (already merged by get_config())
for key, value in config.items():
if value is None:
continue
elif isinstance(value, bool):
env[key] = 'true' if value else 'false'
elif isinstance(value, (list, dict)):
env[key] = json.dumps(value)
else:
env[key] = str(value)
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)
@@ -525,31 +516,35 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
def run_hooks(
event_name: str,
output_dir: Path,
timeout: int = 300,
config: Dict[str, Any],
timeout: Optional[int] = None,
stop_on_failure: bool = False,
config_objects: Optional[List[Any]] = None,
**kwargs: Any
) -> List[HookResult]:
"""
Run all hooks for a given event.
Args:
event_name: The event name to trigger (e.g., 'Snapshot__wget')
event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
output_dir: Working directory for hook scripts
timeout: Maximum execution time per hook
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time per hook (None = auto-detect from plugin config)
stop_on_failure: If True, stop executing hooks after first failure
config_objects: Optional list of objects with .config JSON fields
(e.g., [crawl, snapshot] - later items have higher priority)
**kwargs: Arguments passed to each hook script
Returns:
List of results from each hook execution
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
"""
hooks = discover_hooks(event_name)
hooks = discover_hooks(event_name, config=config)
results = []
for hook in hooks:
result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
# Background hooks return None - skip adding to results
if result is None:
@@ -638,24 +633,44 @@ EXTRACTOR_INDEXING_PRECEDENCE = [
]
def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
"""
Get the list of enabled plugins based on config and available hooks.
Checks for ENABLED_PLUGINS (or legacy ENABLED_EXTRACTORS) in config,
falls back to discovering available hooks from the plugins directory.
Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
Returns plugin names sorted alphabetically (numeric prefix controls order).
Args:
config: Merged config dict from get_config() - if None, uses global config
Returns:
Plugin names sorted alphabetically (numeric prefix controls order).
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
"""
if config:
# Support both new and legacy config keys
if 'ENABLED_PLUGINS' in config:
return config['ENABLED_PLUGINS']
if 'ENABLED_EXTRACTORS' in config:
return config['ENABLED_EXTRACTORS']
# Get merged config if not provided
if config is None:
from archivebox.config.configset import get_config
config = get_config(scope='global')
# Discover from hooks - this is the source of truth
return get_plugins()
# Support explicit ENABLED_PLUGINS override (legacy)
if 'ENABLED_PLUGINS' in config:
return config['ENABLED_PLUGINS']
if 'ENABLED_EXTRACTORS' in config:
return config['ENABLED_EXTRACTORS']
# Filter all plugins by enabled status
all_plugins = get_plugins()
enabled = []
for plugin in all_plugins:
plugin_config = get_plugin_special_config(plugin, config)
if plugin_config['enabled']:
enabled.append(plugin)
return enabled
def discover_plugins_that_provide_interface(
@@ -822,37 +837,6 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
return configs
def get_merged_config_schema() -> Dict[str, Any]:
"""
Get a merged JSONSchema combining all plugin config schemas.
This creates a single schema that can validate all plugin config keys.
Useful for validating the complete configuration at startup.
Returns:
Combined JSONSchema with all plugin properties merged.
"""
plugin_configs = discover_plugin_configs()
merged_properties = {}
for plugin_name, schema in plugin_configs.items():
properties = schema.get('properties', {})
for key, prop_schema in properties.items():
if key in merged_properties:
# Key already exists from another plugin - log warning but keep first
import sys
print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
continue
merged_properties[key] = prop_schema
return {
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": True, # Allow unknown keys (core config, etc.)
"properties": merged_properties,
}
def get_config_defaults_from_plugins() -> Dict[str, Any]:
"""
Get default values for all plugin config options.
@@ -873,173 +857,63 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]:
return defaults
def resolve_config_value(
key: str,
prop_schema: Dict[str, Any],
env_vars: Dict[str, str],
config_file: Dict[str, str],
overrides: Optional[Dict[str, Any]] = None,
) -> Any:
def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
"""
Resolve a single config value following the hierarchy and schema rules.
Extract special config keys for a plugin following naming conventions.
Resolution order (later overrides earlier):
1. Schema default
2. x-fallback (global config key)
3. Config file (ArchiveBox.conf)
4. Environment variables (including x-aliases)
5. Explicit overrides (User/Crawl/Snapshot config)
ArchiveBox recognizes 3 special config key patterns per plugin:
- {PLUGIN}_ENABLED: Enable/disable toggle (default True)
- {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
- {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
These allow ArchiveBox to:
- Skip disabled plugins (optimization)
- Enforce plugin-specific timeouts automatically
- Discover plugin binaries for validation
Args:
key: Config key name (e.g., 'WGET_TIMEOUT')
prop_schema: JSONSchema property definition for this key
env_vars: Environment variables dict
config_file: Config file values dict
overrides: Optional override values (from User/Crawl/Snapshot)
plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
Returns:
Resolved value with appropriate type coercion.
Dict with standardized keys:
{
'enabled': True, # bool
'timeout': 60, # int, seconds
'binary': 'wget', # str, path or name
}
Examples:
>>> from archivebox.config.configset import get_config
>>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
>>> get_plugin_special_config('wget', config)
{'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
"""
value = None
prop_type = prop_schema.get('type', 'string')
plugin_upper = plugin_name.upper()
# 1. Start with schema default
if 'default' in prop_schema:
value = prop_schema['default']
# 1. Enabled: PLUGINNAME_ENABLED (default True)
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# 2. Check x-fallback (global config key)
fallback_key = prop_schema.get('x-fallback')
if fallback_key:
if fallback_key in env_vars:
value = env_vars[fallback_key]
elif fallback_key in config_file:
value = config_file[fallback_key]
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
timeout_key = f'{plugin_upper}_TIMEOUT'
timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
# 3. Check config file for main key
if key in config_file:
value = config_file[key]
# 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
binary_key = f'{plugin_upper}_BINARY'
binary = config.get(binary_key, plugin_name)
# 4. Check environment variables (main key and aliases)
keys_to_check = [key] + prop_schema.get('x-aliases', [])
for check_key in keys_to_check:
if check_key in env_vars:
value = env_vars[check_key]
break
# 5. Apply explicit overrides
if overrides and key in overrides:
value = overrides[key]
# Type coercion for env var strings
if value is not None and isinstance(value, str):
value = coerce_config_value(value, prop_type, prop_schema)
return value
def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
"""
Coerce a string value to the appropriate type based on schema.
Args:
value: String value to coerce
prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
prop_schema: Full property schema (for array item types, etc.)
Returns:
Coerced value of appropriate type.
"""
if prop_type == 'boolean':
return value.lower() in ('true', '1', 'yes', 'on')
elif prop_type == 'integer':
try:
return int(value)
except ValueError:
return prop_schema.get('default', 0)
elif prop_type == 'number':
try:
return float(value)
except ValueError:
return prop_schema.get('default', 0.0)
elif prop_type == 'array':
# Try JSON parse first, fall back to comma-separated
try:
return json.loads(value)
except json.JSONDecodeError:
return [v.strip() for v in value.split(',') if v.strip()]
else:
return value
def get_flat_plugin_config(
env_vars: Optional[Dict[str, str]] = None,
config_file: Optional[Dict[str, str]] = None,
overrides: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Get all plugin config values resolved according to hierarchy.
This is the main function for getting plugin configuration.
It discovers all plugin schemas and resolves each config key.
Args:
env_vars: Environment variables (defaults to os.environ)
config_file: Config file values (from ArchiveBox.conf)
overrides: Override values (from User/Crawl/Snapshot config fields)
Returns:
Flat dict of all resolved config values.
e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
"""
if env_vars is None:
env_vars = dict(os.environ)
if config_file is None:
config_file = {}
plugin_configs = discover_plugin_configs()
flat_config = {}
for plugin_name, schema in plugin_configs.items():
properties = schema.get('properties', {})
for key, prop_schema in properties.items():
flat_config[key] = resolve_config_value(
key, prop_schema, env_vars, config_file, overrides
)
return flat_config
def export_plugin_config_to_env(
config: Dict[str, Any],
env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
"""
Export plugin config values to environment variable format.
Converts all values to strings suitable for subprocess environment.
Arrays are JSON-encoded.
Args:
config: Flat config dict from get_flat_plugin_config()
env: Optional existing env dict to update (creates new if None)
Returns:
Environment dict with config values as strings.
"""
if env is None:
env = {}
for key, value in config.items():
if value is None:
continue
elif isinstance(value, bool):
env[key] = 'true' if value else 'false'
elif isinstance(value, (list, dict)):
env[key] = json.dumps(value)
else:
env[key] = str(value)
return env
return {
'enabled': bool(enabled),
'timeout': int(timeout),
'binary': str(binary),
}
# =============================================================================
@@ -1233,7 +1107,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
if not cmd:
return None
from machine.models import Binary
from archivebox.machine.models import Binary
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
@@ -1266,7 +1140,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
Returns:
Created/updated model instance, or None if type unknown
"""
from machine.models import Binary, Machine
from archivebox.machine.models import Binary, Machine
record_type = record.pop('type', None)
if not record_type:
@@ -1349,25 +1223,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
try:
# Dispatch to appropriate model's from_jsonl() method
if record_type == 'Snapshot':
from core.models import Snapshot
from archivebox.core.models import Snapshot
obj = Snapshot.from_jsonl(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from core.models import Tag
from archivebox.core.models import Tag
obj = Tag.from_jsonl(record.copy(), overrides)
if obj:
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from machine.models import Binary
from archivebox.machine.models import Binary
obj = Binary.from_jsonl(record.copy(), overrides)
if obj:
stats['Binary'] = stats.get('Binary', 0) + 1
elif record_type == 'Machine':
from machine.models import Machine
from archivebox.machine.models import Machine
obj = Machine.from_jsonl(record.copy(), overrides)
if obj:
stats['Machine'] = stats.get('Machine', 0) + 1

View File

@@ -4,7 +4,7 @@ from django.contrib import admin
from django.utils.html import format_html
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from machine.models import Machine, NetworkInterface, Binary
from archivebox.machine.models import Machine, NetworkInterface, Binary
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):

View File

@@ -5,11 +5,11 @@ from django.apps import AppConfig
class MachineConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'machine'
name = 'archivebox.machine'
verbose_name = 'Machine Info'
def register_admin(admin_site):
from machine.admin import register_admin
from archivebox.machine.admin import register_admin
register_admin(admin_site)

View File

@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
replaces = [
('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_binary'),
('machine', '0003_alter_binary_options_and_more'),
('machine', '0004_alter_binary_abspath_and_more'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
]
dependencies = []
@@ -70,22 +70,7 @@ class Migration(migrations.Migration):
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
},
),
migrations.CreateModel(
name='Dependency',
fields=[
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
('bin_providers', models.CharField(default='*', max_length=127)),
('custom_cmds', models.JSONField(blank=True, default=dict)),
('config', models.JSONField(blank=True, default=dict)),
],
options={
'verbose_name': 'Dependency',
'verbose_name_plural': 'Dependencies',
},
),
# Dependency model removed - not needed anymore
migrations.CreateModel(
name='Binary',
fields=[
@@ -100,7 +85,7 @@ class Migration(migrations.Migration):
('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
# dependency FK removed - Dependency model deleted
],
options={
'verbose_name': 'Binary',

View File

@@ -1,6 +1,8 @@
# Generated manually on 2025-12-26
# NOTE: This migration is intentionally empty but kept for dependency chain
# The Dependency model was removed in 0004, so all operations have been stripped
from django.db import migrations, models
from django.db import migrations
class Migration(migrations.Migration):
@@ -10,29 +12,5 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RenameField(
model_name='dependency',
old_name='custom_cmds',
new_name='overrides',
),
migrations.AlterField(
model_name='dependency',
name='bin_name',
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
),
migrations.AlterField(
model_name='dependency',
name='bin_providers',
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
),
migrations.AlterField(
model_name='dependency',
name='overrides',
field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
),
migrations.AlterField(
model_name='dependency',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
),
# All Dependency operations removed - model deleted in 0004
]

View File

@@ -1,8 +1,8 @@
# Generated by Django 6.0 on 2025-12-28 05:12
# NOTE: This migration is intentionally empty but kept for dependency chain
# The Dependency model was removed in 0004, all operations stripped
import django.db.models.deletion
from archivebox import uuid_compat
from django.db import migrations, models
from django.db import migrations
class Migration(migrations.Migration):
@@ -12,34 +12,6 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='dependency',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='binary',
name='dependency',
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
),
migrations.AlterField(
model_name='binary',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
),
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# All operations removed - Dependency model deleted in 0004
# This is a stub migration for users upgrading from old dev versions
]

View File

@@ -0,0 +1,28 @@
# Generated migration - removes Dependency model entirely
# NOTE: This is a cleanup migration for users upgrading from old dev versions
# that had the Dependency model. Fresh installs never create this table.
from django.db import migrations
def drop_dependency_table(apps, schema_editor):
"""
Drop old Dependency table if it exists (from dev versions that had it).
Safe to run multiple times, safe if table doesn't exist.
Does NOT touch machine_binary - that's our current Binary model table!
"""
schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
# Also drop old InstalledBinary table if it somehow still exists
schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
class Migration(migrations.Migration):
dependencies = [
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
]
operations = [
migrations.RunPython(drop_dependency_table, migrations.RunPython.noop),
]

View File

@@ -1,56 +0,0 @@
# Generated migration - Clean slate for Binary model
# Drops old InstalledBinary and Dependency tables, creates new Binary table
from django.db import migrations, models
import django.utils.timezone
import archivebox.uuid_compat
def drop_old_tables(apps, schema_editor):
"""Drop old tables using raw SQL"""
schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
schema_editor.execute('DROP TABLE IF EXISTS machine_binary') # In case rename happened
class Migration(migrations.Migration):
dependencies = [
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
]
operations = [
# Drop old tables using raw SQL
migrations.RunPython(drop_old_tables, migrations.RunPython.noop),
# Create new Binary model from scratch
migrations.CreateModel(
name='Binary',
fields=[
('id', models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
('binprovider', models.CharField(blank=True, default=None, help_text='Provider that successfully installed this binary', max_length=31)),
('abspath', models.CharField(blank=True, default=None, max_length=255)),
('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=models.deletion.CASCADE, to='machine.machine')),
],
options={
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
},
),
migrations.AddIndex(
model_name='binary',
index=models.Index(fields=['machine', 'name', 'abspath', 'version', 'sha256'], name='machine_bin_machine_idx'),
),
]

View File

@@ -4,11 +4,14 @@ import socket
from archivebox.uuid_compat import uuid7
from datetime import timedelta
from statemachine import State, registry
from django.db import models
from django.utils import timezone
from django.utils.functional import cached_property
from archivebox.base_models.models import ModelWithHealthStats
from archivebox.workers.models import BaseStateMachine
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
_CURRENT_MACHINE = None
@@ -50,6 +53,9 @@ class Machine(ModelWithHealthStats):
objects: MachineManager = MachineManager()
networkinterface_set: models.Manager['NetworkInterface']
class Meta:
app_label = 'machine'
@classmethod
def current(cls) -> 'Machine':
global _CURRENT_MACHINE
@@ -115,6 +121,7 @@ class NetworkInterface(ModelWithHealthStats):
objects: NetworkInterfaceManager = NetworkInterfaceManager()
class Meta:
app_label = 'machine'
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
@classmethod
@@ -206,11 +213,12 @@ class Binary(ModelWithHealthStats):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'machine.statemachines.BinaryMachine'
state_machine_name: str = 'machine.models.BinaryMachine'
objects: BinaryManager = BinaryManager()
class Meta:
app_label = 'machine'
verbose_name = 'Binary'
verbose_name_plural = 'Binaries'
unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
@@ -302,9 +310,9 @@ class Binary(ModelWithHealthStats):
DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
return Path(DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id)
def update_for_workers(self, **kwargs):
def update_and_requeue(self, **kwargs):
"""
Update binary fields for worker state machine.
Update binary fields and requeue for worker state machine.
Sets modified_at to ensure workers pick up changes.
Always saves the model after updating.
@@ -325,6 +333,10 @@ class Binary(ModelWithHealthStats):
"""
import json
from archivebox.hooks import discover_hooks, run_hook
from archivebox.config.configset import get_config
# Get merged config (Binary doesn't have crawl/snapshot context)
config = get_config(scope='global')
# Create output directory
output_dir = self.OUTPUT_DIR
@@ -333,7 +345,7 @@ class Binary(ModelWithHealthStats):
self.save()
# Discover ALL on_Binary__install_* hooks
hooks = discover_hooks('Binary')
hooks = discover_hooks('Binary', config=config)
if not hooks:
self.status = self.StatusChoices.FAILED
self.save()
@@ -361,7 +373,8 @@ class Binary(ModelWithHealthStats):
result = run_hook(
hook,
output_dir=plugin_output_dir,
timeout=600, # 10 min timeout
config=config,
timeout=600, # 10 min timeout for binary installation
**hook_kwargs
)
@@ -420,3 +433,128 @@ class Binary(ModelWithHealthStats):
kill_process(pid_file)
# =============================================================================
# Binary State Machine
# =============================================================================
class BinaryMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing Binary installation lifecycle.

    States mirror ``Binary.StatusChoices`` so the DB status column and the
    state machine's current state always agree.

    Hook Lifecycle:

    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Binary needs to be installed                             │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. binary.run()                                            │
    │     • discover_hooks('Binary') → all on_Binary__install_*   │
    │     • Try each provider hook in sequence:                   │
    │       - run_hook(script, output_dir, ...)                   │
    │       - If returncode == 0:                                 │
    │         * Read stdout.log                                   │
    │         * Parse JSONL for 'Binary' record with abspath      │
    │         * Update self: abspath, version, sha256, provider   │
    │         * Set status=SUCCEEDED, RETURN                      │
    │     • If no hook succeeds: set status=FAILED                │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED                                          │
    │  • Set by binary.run() based on hook results                │
    │  • Health stats incremented (num_uses_succeeded/failed)     │
    └─────────────────────────────────────────────────────────────┘
    """

    # Attribute name on this machine that holds the bound Binary instance
    # (consumed by BaseStateMachine).
    model_attr_name = 'binary'

    # States — each value matches a Binary.StatusChoices entry.
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # Tick Event - transitions based on conditions.
    # NOTE: python-statemachine evaluates the |-chained alternatives in
    # order, so the self-loop guards (unless=...) must come first.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed')
    )

    def can_start(self) -> bool:
        """Check if binary installation can start (name + at least one provider)."""
        return bool(self.binary.name and self.binary.binproviders)

    def is_succeeded(self) -> bool:
        """Check if installation succeeded (status was set by run())."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if installation failed (status was set by run())."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """Check if installation has completed (success or failure)."""
        return self.binary.status in (
            Binary.StatusChoices.SUCCEEDED,
            Binary.StatusChoices.FAILED,
        )

    @queued.enter
    def enter_queued(self):
        """Binary is queued for installation; retry_at=now makes workers pick it up immediately."""
        self.binary.update_and_requeue(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Start binary installation."""
        # Lock the binary while installation runs: pushing retry_at into the
        # future stops other workers from grabbing it mid-install.
        self.binary.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
            status=Binary.StatusChoices.STARTED,
        )
        # Run installation hooks (sets status to SUCCEEDED/FAILED on the model)
        self.binary.run()
        # Save updated status (run() updates status to succeeded/failed)
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Binary installed successfully; retry_at=None removes it from the work queue."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        # Increment health stats
        self.binary.increment_health_stats(success=True)

    @failed.enter
    def enter_failed(self):
        """Binary installation failed; retry_at=None removes it from the work queue."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        # Increment health stats
        self.binary.increment_health_stats(success=False)


# =============================================================================
# State Machine Registration
# =============================================================================

# Manually register state machines with python-statemachine registry
registry.register(BinaryMachine)

View File

@@ -1,112 +0,0 @@
__package__ = 'archivebox.machine'
from datetime import timedelta
from django.utils import timezone
from django.db.models import F
from statemachine import State, StateMachine
from machine.models import Binary
class BinaryMachine(StateMachine, strict_states=True):
    """
    State machine for managing Binary installation lifecycle.

    Follows the unified pattern used by Crawl, Snapshot, and ArchiveResult:
    - queued: Binary needs to be installed
    - started: Installation hooks are running
    - succeeded: Binary installed successfully (abspath, version, sha256 populated)
    - failed: Installation failed permanently
    """

    # Type of the model instance this machine drives.
    model: Binary

    # States — each value matches a Binary.StatusChoices entry so the DB
    # status column and the machine's current state always agree.
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # Tick Event - transitions based on conditions.
    # NOTE: python-statemachine evaluates the |-chained alternatives in
    # order, so the self-loop guards (unless=...) must come first.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed')
    )

    def __init__(self, binary, *args, **kwargs):
        # Keep a direct reference to the model; the base StateMachine also
        # receives it as the bound model object.
        self.binary = binary
        super().__init__(binary, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Binary[{self.binary.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """Check if binary installation can start."""
        return bool(self.binary.name and self.binary.binproviders)

    def is_succeeded(self) -> bool:
        """Check if installation succeeded (status was set by run())."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if installation failed (status was set by run())."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """Check if installation has completed (success or failure)."""
        return self.binary.status in (
            Binary.StatusChoices.SUCCEEDED,
            Binary.StatusChoices.FAILED,
        )

    @queued.enter
    def enter_queued(self):
        """Binary is queued for installation."""
        self.binary.update_for_workers(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Start binary installation."""
        # Lock the binary while installation runs: a future retry_at stops
        # other workers from grabbing it mid-install.
        self.binary.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=300),  # 5 min timeout for installation
            status=Binary.StatusChoices.STARTED,
        )
        # Run installation hooks
        self.binary.run()
        # Save updated status (run() updates status to succeeded/failed)
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Binary installed successfully."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        # Increment health stats atomically via an F() expression to avoid
        # read-modify-write races between workers.
        Binary.objects.filter(pk=self.binary.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self):
        """Binary installation failed."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        # Increment health stats atomically via an F() expression to avoid
        # read-modify-write races between workers.
        Binary.objects.filter(pk=self.binary.pk).update(num_uses_failed=F('num_uses_failed') + 1)

View File

@@ -250,68 +250,13 @@ def process_records(
yield result
def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
    """
    Fetch or create a Snapshot described by a JSONL record dict.

    Requires a 'url' key; optionally applies title, tags, depth,
    parent_snapshot_id, bookmarked_at, and crawl_id from the record.
    Returns the Snapshot instance.

    Raises:
        ValueError: if the record has no 'url' field.
    """
    from core.models import Snapshot
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.misc.util import parse_date

    creator_id = created_by_id or get_or_create_system_user_pk()

    url = record.get('url')
    if not url:
        raise ValueError("Record missing required 'url' field")

    # Normalize bookmarked_at: the record may carry it as an ISO string.
    bookmarked = record.get('bookmarked_at')
    if bookmarked and isinstance(bookmarked, str):
        bookmarked = parse_date(bookmarked)

    # Upsert via the manager helper (handles url/title/tags merging).
    snapshot = Snapshot.objects.create_or_update_from_dict(
        {'url': url, 'title': record.get('title'), 'tags': record.get('tags', '')},
        created_by_id=creator_id,
    )

    # Apply the remaining optional fields only when they actually differ,
    # collecting the changed field names for a targeted save().
    dirty = []

    depth = record.get('depth', 0)
    if depth is not None and snapshot.depth != depth:
        snapshot.depth = depth
        dirty.append('depth')

    parent_id = record.get('parent_snapshot_id')
    if parent_id and str(snapshot.parent_snapshot_id) != str(parent_id):
        snapshot.parent_snapshot_id = parent_id
        dirty.append('parent_snapshot_id')

    if bookmarked and snapshot.bookmarked_at != bookmarked:
        snapshot.bookmarked_at = bookmarked
        dirty.append('bookmarked_at')

    crawl_id = record.get('crawl_id')
    if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
        snapshot.crawl_id = crawl_id
        dirty.append('crawl_id')

    if dirty:
        snapshot.save(update_fields=dirty + ['modified_at'])

    return snapshot
def get_or_create_tag(record: Dict[str, Any]):
"""
Get or create a Tag from a JSONL record.
Returns the Tag instance.
"""
from core.models import Tag
from archivebox.core.models import Tag
name = record.get('name')
if not name:
@@ -353,8 +298,11 @@ def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Opti
elif record_type == TYPE_SNAPSHOT or 'url' in record:
try:
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
results['snapshots'].append(snapshot)
from archivebox.core.models import Snapshot
overrides = {'created_by_id': created_by_id} if created_by_id else {}
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
if snapshot:
results['snapshots'].append(snapshot)
except ValueError:
continue

View File

@@ -17,7 +17,7 @@ from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
if TYPE_CHECKING:
from core.models import Snapshot
from archivebox.core.models import Snapshot
from rich import print
from rich.panel import Panel
@@ -257,7 +257,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
def log_archiving_finished(num_links: int):
from core.models import Snapshot
from archivebox.core.models import Snapshot
end_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.archiving_end_ts = end_ts
@@ -395,7 +395,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(snapshots):
from core.models import Snapshot
from archivebox.core.models import Snapshot
print()
print('---------------------------------------------------------------------------------------------------')
print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))

View File

@@ -1,335 +0,0 @@
__package__ = 'abx.archivebox'
# from django.test import TestCase
# from .toml_util import convert, TOML_HEADER
# TEST_INPUT = """
# [SERVER_CONFIG]
# IS_TTY=False
# USE_COLOR=False
# SHOW_PROGRESS=False
# IN_DOCKER=False
# IN_QEMU=False
# PUID=501
# PGID=20
# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
# ONLY_NEW=True
# TIMEOUT=60
# MEDIA_TIMEOUT=3600
# OUTPUT_PERMISSIONS=644
# RESTRICT_FILE_NAMES=windows
# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
# URL_ALLOWLIST=None
# ADMIN_USERNAME=None
# ADMIN_PASSWORD=None
# ENFORCE_ATOMIC_WRITES=True
# TAG_SEPARATOR_PATTERN=[,]
# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# BIND_ADDR=127.0.0.1:8000
# ALLOWED_HOSTS=*
# DEBUG=False
# PUBLIC_INDEX=True
# PUBLIC_SNAPSHOTS=True
# PUBLIC_ADD_VIEW=False
# FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
# SNAPSHOTS_PER_PAGE=40
# CUSTOM_TEMPLATES_DIR=None
# TIME_ZONE=UTC
# TIMEZONE=UTC
# REVERSE_PROXY_USER_HEADER=Remote-User
# REVERSE_PROXY_WHITELIST=
# LOGOUT_REDIRECT_URL=/
# PREVIEW_ORIGINALS=True
# LDAP=False
# LDAP_SERVER_URI=None
# LDAP_BIND_DN=None
# LDAP_BIND_PASSWORD=None
# LDAP_USER_BASE=None
# LDAP_USER_FILTER=None
# LDAP_USERNAME_ATTR=None
# LDAP_FIRSTNAME_ATTR=None
# LDAP_LASTNAME_ATTR=None
# LDAP_EMAIL_ATTR=None
# LDAP_CREATE_SUPERUSER=False
# SAVE_TITLE=True
# SAVE_FAVICON=True
# SAVE_WGET=True
# SAVE_WGET_REQUISITES=True
# SAVE_SINGLEFILE=True
# SAVE_READABILITY=True
# SAVE_MERCURY=True
# SAVE_HTMLTOTEXT=True
# SAVE_PDF=True
# SAVE_SCREENSHOT=True
# SAVE_DOM=True
# SAVE_HEADERS=True
# SAVE_WARC=True
# SAVE_GIT=True
# SAVE_MEDIA=True
# SAVE_ARCHIVE_DOT_ORG=True
# RESOLUTION=1440,2000
# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
# CHECK_SSL_VALIDITY=True
# MEDIA_MAX_SIZE=750m
# USER_AGENT=None
# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
# COOKIES_FILE=None
# CHROME_USER_DATA_DIR=None
# CHROME_TIMEOUT=0
# CHROME_HEADLESS=True
# CHROME_SANDBOX=True
# CHROME_EXTRA_ARGS=[]
# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
# YOUTUBEDL_EXTRA_ARGS=[]
# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
# WGET_EXTRA_ARGS=[]
# CURL_ARGS=['--silent', '--location', '--compressed']
# CURL_EXTRA_ARGS=[]
# GIT_ARGS=['--recursive']
# SINGLEFILE_ARGS=[]
# SINGLEFILE_EXTRA_ARGS=[]
# MERCURY_ARGS=['--format=text']
# MERCURY_EXTRA_ARGS=[]
# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
# USE_INDEXING_BACKEND=True
# USE_SEARCHING_BACKEND=True
# SEARCH_BACKEND_ENGINE=ripgrep
# SEARCH_BACKEND_HOST_NAME=localhost
# SEARCH_BACKEND_PORT=1491
# SEARCH_BACKEND_PASSWORD=SecretPassword
# SEARCH_PROCESS_HTML=True
# SONIC_COLLECTION=archivebox
# SONIC_BUCKET=snapshots
# SEARCH_BACKEND_TIMEOUT=90
# FTS_SEPARATE_DATABASE=True
# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
# FTS_SQLITE_MAX_LENGTH=1000000000
# USE_CURL=True
# USE_WGET=True
# USE_SINGLEFILE=True
# USE_READABILITY=True
# USE_MERCURY=True
# USE_GIT=True
# USE_CHROME=True
# USE_NODE=True
# USE_YOUTUBEDL=True
# USE_RIPGREP=True
# CURL_BINARY=curl
# GIT_BINARY=git
# WGET_BINARY=wget
# SINGLEFILE_BINARY=single-file
# READABILITY_BINARY=readability-extractor
# MERCURY_BINARY=postlight-parser
# YOUTUBEDL_BINARY=yt-dlp
# NODE_BINARY=node
# RIPGREP_BINARY=rg
# CHROME_BINARY=chrome
# POCKET_CONSUMER_KEY=None
# USER=squash
# PACKAGE_DIR=/opt/archivebox/archivebox
# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
# ARCHIVE_DIR=/opt/archivebox/data/archive
# SOURCES_DIR=/opt/archivebox/data/sources
# LOGS_DIR=/opt/archivebox/data/logs
# PERSONAS_DIR=/opt/archivebox/data/personas
# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
# URL_ALLOWLIST_PTN=None
# DIR_OUTPUT_PERMISSIONS=755
# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
# VERSION=0.8.0
# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
# BUILD_TIME=2024-05-15 03:28:05 1715768885
# VERSIONS_AVAILABLE=None
# CAN_UPGRADE=False
# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
# PYTHON_VERSION=3.10.14
# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
# DJANGO_VERSION=5.0.6 final (0)
# SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
# SQLITE_VERSION=2.6.0
# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
# WGET_VERSION=GNU Wget 1.24.5
# WGET_AUTO_COMPRESSION=True
# RIPGREP_VERSION=ripgrep 14.1.0
# SINGLEFILE_VERSION=None
# READABILITY_VERSION=None
# MERCURY_VERSION=None
# GIT_VERSION=git version 2.44.0
# YOUTUBEDL_VERSION=2024.04.09
# CHROME_VERSION=Google Chrome 124.0.6367.207
# NODE_VERSION=v21.7.3
# """
# EXPECTED_OUTPUT = TOML_HEADER + '''[SERVER_CONFIG]
# IS_TTY = false
# USE_COLOR = false
# SHOW_PROGRESS = false
# IN_DOCKER = false
# IN_QEMU = false
# PUID = 501
# PGID = 20
# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
# ONLY_NEW = true
# TIMEOUT = 60
# MEDIA_TIMEOUT = 3600
# OUTPUT_PERMISSIONS = 644
# RESTRICT_FILE_NAMES = "windows"
# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
# URL_ALLOWLIST = null
# ADMIN_USERNAME = null
# ADMIN_PASSWORD = null
# ENFORCE_ATOMIC_WRITES = true
# TAG_SEPARATOR_PATTERN = "[,]"
# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# BIND_ADDR = "127.0.0.1:8000"
# ALLOWED_HOSTS = "*"
# DEBUG = false
# PUBLIC_INDEX = true
# PUBLIC_SNAPSHOTS = true
# PUBLIC_ADD_VIEW = false
# FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
# SNAPSHOTS_PER_PAGE = 40
# CUSTOM_TEMPLATES_DIR = null
# TIME_ZONE = "UTC"
# TIMEZONE = "UTC"
# REVERSE_PROXY_USER_HEADER = "Remote-User"
# REVERSE_PROXY_WHITELIST = ""
# LOGOUT_REDIRECT_URL = "/"
# PREVIEW_ORIGINALS = true
# LDAP = false
# LDAP_SERVER_URI = null
# LDAP_BIND_DN = null
# LDAP_BIND_PASSWORD = null
# LDAP_USER_BASE = null
# LDAP_USER_FILTER = null
# LDAP_USERNAME_ATTR = null
# LDAP_FIRSTNAME_ATTR = null
# LDAP_LASTNAME_ATTR = null
# LDAP_EMAIL_ATTR = null
# LDAP_CREATE_SUPERUSER = false
# SAVE_TITLE = true
# SAVE_FAVICON = true
# SAVE_WGET = true
# SAVE_WGET_REQUISITES = true
# SAVE_SINGLEFILE = true
# SAVE_READABILITY = true
# SAVE_MERCURY = true
# SAVE_HTMLTOTEXT = true
# SAVE_PDF = true
# SAVE_SCREENSHOT = true
# SAVE_DOM = true
# SAVE_HEADERS = true
# SAVE_WARC = true
# SAVE_GIT = true
# SAVE_MEDIA = true
# SAVE_ARCHIVE_DOT_ORG = true
# RESOLUTION = [1440, 2000]
# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
# CHECK_SSL_VALIDITY = true
# MEDIA_MAX_SIZE = "750m"
# USER_AGENT = null
# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
# COOKIES_FILE = null
# CHROME_USER_DATA_DIR = null
# CHROME_TIMEOUT = false
# CHROME_HEADLESS = true
# CHROME_SANDBOX = true
# CHROME_EXTRA_ARGS = []
# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
# YOUTUBEDL_EXTRA_ARGS = []
# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
# WGET_EXTRA_ARGS = []
# CURL_ARGS = ["--silent", "--location", "--compressed"]
# CURL_EXTRA_ARGS = []
# GIT_ARGS = ["--recursive"]
# SINGLEFILE_ARGS = []
# SINGLEFILE_EXTRA_ARGS = []
# MERCURY_ARGS = ["--format=text"]
# MERCURY_EXTRA_ARGS = []
# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
# USE_INDEXING_BACKEND = true
# USE_SEARCHING_BACKEND = true
# SEARCH_BACKEND_ENGINE = "ripgrep"
# SEARCH_BACKEND_HOST_NAME = "localhost"
# SEARCH_BACKEND_PORT = 1491
# SEARCH_BACKEND_PASSWORD = "SecretPassword"
# SEARCH_PROCESS_HTML = true
# SONIC_COLLECTION = "archivebox"
# SONIC_BUCKET = "snapshots"
# SEARCH_BACKEND_TIMEOUT = 90
# FTS_SEPARATE_DATABASE = true
# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
# FTS_SQLITE_MAX_LENGTH = 1000000000
# USE_CURL = true
# USE_WGET = true
# USE_SINGLEFILE = true
# USE_READABILITY = true
# USE_MERCURY = true
# USE_GIT = true
# USE_CHROME = true
# USE_NODE = true
# USE_YOUTUBEDL = true
# USE_RIPGREP = true
# CURL_BINARY = "curl"
# GIT_BINARY = "git"
# WGET_BINARY = "wget"
# SINGLEFILE_BINARY = "single-file"
# READABILITY_BINARY = "readability-extractor"
# MERCURY_BINARY = "postlight-parser"
# YOUTUBEDL_BINARY = "yt-dlp"
# NODE_BINARY = "node"
# RIPGREP_BINARY = "rg"
# CHROME_BINARY = "chrome"
# POCKET_CONSUMER_KEY = null
# USER = "squash"
# PACKAGE_DIR = "/opt/archivebox/archivebox"
# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
# ARCHIVE_DIR = "/opt/archivebox/data/archive"
# SOURCES_DIR = "/opt/archivebox/data/sources"
# LOGS_DIR = "/opt/archivebox/data/logs"
# PERSONAS_DIR = "/opt/archivebox/data/personas"
# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
# URL_ALLOWLIST_PTN = null
# DIR_OUTPUT_PERMISSIONS = 755
# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
# VERSION = "0.8.0"
# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
# VERSIONS_AVAILABLE = null
# CAN_UPGRADE = false
# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
# PYTHON_VERSION = "3.10.14"
# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
# DJANGO_VERSION = "5.0.6 final (0)"
# SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
# SQLITE_VERSION = "2.6.0"
# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
# WGET_VERSION = "GNU Wget 1.24.5"
# WGET_AUTO_COMPRESSION = true
# RIPGREP_VERSION = "ripgrep 14.1.0"
# SINGLEFILE_VERSION = null
# READABILITY_VERSION = null
# MERCURY_VERSION = null
# GIT_VERSION = "git version 2.44.0"
# YOUTUBEDL_VERSION = "2024.04.09"
# CHROME_VERSION = "Google Chrome 124.0.6367.207"
# NODE_VERSION = "v21.7.3"'''
# class IniToTomlTests(TestCase):
# def test_convert(self):
# first_output = convert(TEST_INPUT) # make sure ini -> toml parses correctly
# second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently
# assert first_output == second_output == EXPECTED_OUTPUT # make sure parsing is indempotent
# # DEBUGGING
# import sys
# import difflib
# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
# print(repr(second_output))

View File

@@ -478,62 +478,6 @@ for url_str, num_urls in _test_url_strs.items():
### Chrome Helpers
def chrome_args(**options) -> List[str]:
    """Build the argv list for launching Chrome for archiving tasks.

    Keyword options override the corresponding archivebox.config defaults:
    CHROME_BINARY, CHROME_HEADLESS, CHROME_SANDBOX, CHECK_SSL_VALIDITY,
    CHROME_USER_AGENT, RESOLUTION, CHROME_TIMEOUT, CHROME_USER_DATA_DIR.

    Returns:
        List[str]: [binary_path, *flags] suitable for subprocess execution.

    Raises:
        Exception: if no Chrome binary is configured.
    """
    # NOTE: removed an unused `import shutil` that was never referenced here.
    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY

    chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
    chrome_headless = options.get('CHROME_HEADLESS', True)
    chrome_sandbox = options.get('CHROME_SANDBOX', True)
    check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
    user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
    resolution = options.get('RESOLUTION', RESOLUTION)
    timeout = options.get('CHROME_TIMEOUT', 0)
    user_data_dir = options.get('CHROME_USER_DATA_DIR', None)

    if not chrome_binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    cmd_args = [chrome_binary]

    if chrome_headless:
        cmd_args += ("--headless=new",)

    if not chrome_sandbox:
        # running in docker or other sandboxed environment
        cmd_args += (
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
        )

    if not check_ssl:
        cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

    if user_agent:
        cmd_args += (f'--user-agent={user_agent}',)

    if resolution:
        cmd_args += (f'--window-size={resolution}',)

    if timeout:
        # NOTE(review): Chrome expects --timeout in milliseconds here — confirm
        # this flag is honored by the Chrome build in use.
        cmd_args += (f'--timeout={timeout * 1000}',)

    if user_data_dir:
        cmd_args += (f'--user-data-dir={user_data_dir}',)

    return cmd_args
def chrome_cleanup():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by

View File

@@ -3,4 +3,4 @@ from django.apps import AppConfig
class SessionsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "personas"
name = "archivebox.personas"

View File

@@ -29,6 +29,7 @@
# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
# class Meta:
# app_label = 'personas'
# verbose_name = 'Session Type'
# verbose_name_plural = 'Session Types'
# unique_together = (('created_by', 'name'),)

View File

@@ -3,10 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_ARCHIVE_DOT_ORG": {
"ARCHIVE_ORG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
"x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVE_ORG_TIMEOUT": {

View File

@@ -0,0 +1,10 @@
{% load config_tags %}
{# Embed view: renders the archive.org snapshot in a mid-size sandboxed iframe #}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{# NOTE(review): this key is ARCHIVEDOTORG_ENABLED, but the plugin's JSON schema declares ARCHIVE_ORG_ENABLED — confirm which key get_config resolves #}
{% if enabled %}
<!-- Archive.org embed - full iframe view -->
<iframe src="{{ output_path }}"
        class="extractor-embed archivedotorg-embed"
        style="width: 100%; height: 600px; border: 1px solid #ddd;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -0,0 +1,10 @@
{% load config_tags %}
{# Fullscreen view: renders the archive.org snapshot as a full-viewport sandboxed iframe #}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{# NOTE(review): this key is ARCHIVEDOTORG_ENABLED, but the plugin's JSON schema declares ARCHIVE_ORG_ENABLED — confirm which key get_config resolves #}
{% if enabled %}
<!-- Archive.org fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen archivedotorg-fullscreen"
        style="width: 100%; height: 100vh; border: none;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -0,0 +1,12 @@
{% load config_tags %}
{# Thumbnail view: small non-interactive preview (pointer-events disabled, lazy-loaded) #}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{# NOTE(review): this key is ARCHIVEDOTORG_ENABLED, but the plugin's JSON schema declares ARCHIVE_ORG_ENABLED — confirm which key get_config resolves #}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
    <iframe src="{{ output_path }}"
            style="width: 100%; height: 100px; border: none; pointer-events: none;"
            loading="lazy"
            sandbox="allow-same-origin">
    </iframe>
</div>
{% endif %}

View File

@@ -60,21 +60,6 @@
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SAVE_SCREENSHOT": {
"type": "boolean",
"default": true,
"description": "Enable screenshot capture"
},
"SAVE_PDF": {
"type": "boolean",
"default": true,
"description": "Enable PDF generation"
},
"SAVE_DOM": {
"type": "boolean",
"default": true,
"description": "Enable DOM capture"
}
}
}

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"DOM_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DOM", "USE_DOM"],
"description": "Enable DOM capture"
},
"DOM_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for DOM capture in seconds"
}
}
}

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FAVICON": {
"FAVICON_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {

View File

@@ -2,6 +2,7 @@
Integration tests for favicon plugin
Tests verify:
pass
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
@@ -40,7 +41,7 @@ def test_requests_library_available():
)
if result.returncode != 0:
pytest.skip("requests library not installed")
pass
assert len(result.stdout.strip()) > 0, "Should report requests version"
@@ -58,7 +59,7 @@ def test_extracts_favicon_from_example_com():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_favicon_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -124,7 +126,7 @@ def test_config_timeout_honored():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -155,7 +157,7 @@ def test_config_user_agent():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -181,6 +183,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -201,7 +204,7 @@ def test_handles_https_urls():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -232,7 +235,7 @@ def test_handles_missing_favicon_gracefully():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FORUMDL": {
"FORUMDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
"description": "Enable forum downloading with forum-dl"
},
"FORUMDL_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for forumdl plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -48,7 +49,9 @@ def get_forumdl_binary_path():
# Check if binary was found
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
@@ -77,7 +80,9 @@ def get_forumdl_binary_path():
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
pass
if install_line.strip():
pass
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
@@ -107,7 +112,7 @@ def test_forumdl_install_hook():
"""Test forum-dl install hook checks for forum-dl."""
# Skip if install hook doesn't exist yet
if not FORUMDL_INSTALL_HOOK.exists():
pytest.skip(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
pass
# Run forum-dl install hook
result = subprocess.run(
@@ -123,14 +128,18 @@ def test_forumdl_install_hook():
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'forum-dl':
assert record['abspath'], "forum-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'forum-dl':
found_dependency = True
except json.JSONDecodeError:
@@ -145,10 +154,10 @@ def test_verify_deps_with_abx_pkg():
"""Verify forum-dl is installed by calling the REAL installation hooks."""
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip(
"forum-dl installation skipped. Install hook may not exist or "
"forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
"due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
assert False, (
"forum-dl installation failed. Install hook should install forum-dl automatically. "
"Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
"due to removed longintrepr.h header."
)
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
@@ -159,7 +168,7 @@ def test_handles_non_forum_url():
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
pass
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:
@@ -186,6 +195,7 @@ def test_handles_non_forum_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -231,7 +241,7 @@ def test_config_timeout():
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
pass
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GALLERYDL": {
"GALLERYDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
"description": "Enable gallery downloading with gallery-dl"
},
"GALLERYDL_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for gallerydl plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -45,14 +46,18 @@ def test_gallerydl_install_hook():
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'gallery-dl':
assert record['abspath'], "gallery-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'gallery-dl':
found_dependency = True
except json.JSONDecodeError:
@@ -76,7 +81,7 @@ def test_verify_deps_with_abx_pkg():
missing_binaries.append('gallery-dl')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
pass
def test_handles_non_gallery_url():
@@ -103,6 +108,7 @@ def test_handles_non_gallery_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GIT": {
"GIT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GIT", "USE_GIT"],
"description": "Enable git repository cloning"
},
"GIT_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for git plugin
Tests verify:
pass
1. Validate hook checks for git binary
2. Verify deps with abx-pkg
3. Standalone git extractor execution
@@ -37,7 +38,9 @@ def test_git_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -52,7 +55,9 @@ def test_git_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -74,7 +79,7 @@ def test_verify_deps_with_abx_pkg():
if git_loaded and git_loaded.abspath:
assert True, "git is available"
else:
pytest.skip("git not available - Dependency record should have been emitted")
pass
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:
@@ -88,8 +93,9 @@ def test_reports_missing_git():
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
def test_handles_non_git_url():
pass
if not shutil.which('git'):
pytest.skip("git not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
@@ -104,6 +110,7 @@ def test_handles_non_git_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -2,6 +2,7 @@
Integration tests for headers plugin
Tests verify:
pass
1. Plugin script exists and is executable
2. Node.js is available
3. Headers extraction works for real example.com
@@ -38,7 +39,7 @@ def test_node_is_available():
)
if result.returncode != 0:
pytest.skip("node not installed on system")
pass
binary_path = result.stdout.strip()
assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
@@ -59,7 +60,7 @@ def test_extracts_headers_from_example_com():
# Check node is available
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_headers_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -119,7 +121,7 @@ def test_headers_output_structure():
"""Test that headers plugin produces correctly structured output."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -140,6 +142,7 @@ def test_headers_output_structure():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -175,7 +178,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
"""Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -198,6 +201,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -224,7 +228,7 @@ def test_config_timeout_honored():
"""Test that TIMEOUT config is respected."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -251,7 +255,7 @@ def test_config_user_agent():
"""Test that USER_AGENT config is used."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -277,6 +281,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -293,7 +298,7 @@ def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -318,7 +323,7 @@ def test_handles_404_gracefully():
"""Test that headers plugin handles 404s gracefully."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -1,279 +0,0 @@
/**
* Unit tests for istilldontcareaboutcookies plugin
*
* Run with: node --test tests/test_istilldontcareaboutcookies.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('istilldontcareaboutcookies plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
});
});
describe('installCookiesExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.1.8' })
);
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir,
version: '1.1.8'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installCookiesExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should not require any configuration', async () => {
// This extension works out of the box
// No API keys or config needed
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.ok(EXTENSION);
// No config fields should be required
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct extension name', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create mock extension
const mockExtension = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
});
it('should use correct filename pattern', () => {
const expectedPattern = 'istilldontcareaboutcookies.extension.json';
const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
// Pattern should match expected format
assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
});
});
describe('extension functionality', () => {
it('should work automatically without configuration', () => {
// This extension automatically dismisses cookie banners
// No manual trigger or configuration needed
const features = {
automaticBannerDismissal: true,
requiresConfiguration: false,
requiresApiKey: false,
requiresUserAction: false
};
assert.strictEqual(features.automaticBannerDismissal, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.requiresUserAction, false);
});
it('should not require any runtime hooks', () => {
// Extension works purely via Chrome's content script injection
// No need for additional hooks or configuration
const requiresHooks = {
preNavigation: false,
postNavigation: false,
onPageLoad: false
};
assert.strictEqual(requiresHooks.preNavigation, false);
assert.strictEqual(requiresHooks.postNavigation, false);
assert.strictEqual(requiresHooks.onPageLoad, false);
});
});
describe('priority and execution order', () => {
it('should have priority 02 (early)', () => {
const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
// Extract priority from filename
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 2);
});
it('should run before chrome (priority 20)', () => {
const extensionPriority = 2;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
});
describe('error handling', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should handle corrupted cache gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create corrupted cache
fs.writeFileSync(cacheFile, 'invalid json content');
// Should detect corruption and proceed with fresh install
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock loadOrInstallExtension to avoid actual download
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
extensionUtils.loadOrInstallExtension = async () => ({
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
});
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
assert.notStrictEqual(result, null);
});
it('should handle missing manifest gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
// Create directory without manifest
fs.mkdirSync(fakeExtensionDir, { recursive: true });
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock to return fresh extension when manifest missing
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
let freshInstallCalled = false;
extensionUtils.loadOrInstallExtension = async () => {
freshInstallCalled = true;
return {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
};
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
// Should trigger fresh install when manifest missing
assert.ok(freshInstallCalled || result);
});
});
});

View File

@@ -3,16 +3,16 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MEDIA": {
"MEDIA_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
"x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
"description": "Enable media downloading with yt-dlp"
},
"YOUTUBEDL_BINARY": {
"MEDIA_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"MEDIA_TIMEOUT": {
@@ -28,13 +28,14 @@
"pattern": "^\\d+[kmgKMG]?$",
"description": "Maximum file size for media downloads"
},
"YTDLP_CHECK_SSL_VALIDITY": {
"MEDIA_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"YTDLP_ARGS": {
"MEDIA_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
@@ -44,11 +45,13 @@
"--embed-subs",
"--write-auto-sub"
],
"x-aliases": ["YTDLP_ARGS"],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"MEDIA_EXTRA_ARGS": {
"type": "string",
"default": "",
"x-aliases": ["YTDLP_EXTRA_ARGS"],
"description": "Extra arguments for yt-dlp (space-separated)"
}
}

View File

@@ -2,6 +2,7 @@
Integration tests for media plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -45,7 +46,9 @@ def test_ytdlp_install_hook():
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -94,7 +97,7 @@ def test_verify_deps_with_abx_pkg():
missing_binaries.append('ffmpeg')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
pass
def test_handles_non_media_url():
"""Test that media extractor handles non-media URLs gracefully via hook."""
@@ -120,6 +123,7 @@ def test_handles_non_media_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MERCURY": {
"MERCURY_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_MERCURY", "USE_MERCURY"],
"description": "Enable Mercury text extraction"
},
"MERCURY_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for mercury plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -44,7 +45,9 @@ def test_mercury_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -59,7 +62,9 @@ def test_mercury_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
if mercury_loaded and mercury_loaded.abspath:
assert True, "postlight-parser is available"
else:
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
pass
def test_extracts_with_mercury_parser():
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
@@ -122,6 +127,7 @@ def test_extracts_with_mercury_parser():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -184,6 +190,7 @@ def test_fails_gracefully_without_html():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -1,925 +0,0 @@
{
"name": "archivebox-plugins",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "archivebox-plugins",
"dependencies": {
"puppeteer-core": "^24.34.0"
}
},
"node_modules/@puppeteer/browsers": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
"integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
"license": "Apache-2.0",
"dependencies": {
"debug": "^4.4.3",
"extract-zip": "^2.0.1",
"progress": "^2.0.3",
"proxy-agent": "^6.5.0",
"semver": "^7.7.3",
"tar-fs": "^3.1.1",
"yargs": "^17.7.2"
},
"bin": {
"browsers": "lib/cjs/main-cli.js"
},
"engines": {
"node": ">=18"
}
},
"node_modules/@tootallnate/quickjs-emscripten": {
"version": "0.23.0",
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
"license": "MIT"
},
"node_modules/@types/node": {
"version": "25.0.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
"integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
"license": "MIT",
"optional": true,
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/@types/yauzl": {
"version": "2.10.3",
"resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
"integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
"license": "MIT",
"optional": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/agent-base": {
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
"license": "MIT",
"dependencies": {
"color-convert": "^2.0.1"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/ast-types": {
"version": "0.13.4",
"resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
"integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
"license": "MIT",
"dependencies": {
"tslib": "^2.0.1"
},
"engines": {
"node": ">=4"
}
},
"node_modules/b4a": {
"version": "1.7.3",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
"integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
"license": "Apache-2.0",
"peerDependencies": {
"react-native-b4a": "*"
},
"peerDependenciesMeta": {
"react-native-b4a": {
"optional": true
}
}
},
"node_modules/bare-events": {
"version": "2.8.2",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
"license": "Apache-2.0",
"peerDependencies": {
"bare-abort-controller": "*"
},
"peerDependenciesMeta": {
"bare-abort-controller": {
"optional": true
}
}
},
"node_modules/bare-fs": {
"version": "4.5.2",
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-events": "^2.5.4",
"bare-path": "^3.0.0",
"bare-stream": "^2.6.4",
"bare-url": "^2.2.2",
"fast-fifo": "^1.3.2"
},
"engines": {
"bare": ">=1.16.0"
},
"peerDependencies": {
"bare-buffer": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
}
}
},
"node_modules/bare-os": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
"license": "Apache-2.0",
"optional": true,
"engines": {
"bare": ">=1.14.0"
}
},
"node_modules/bare-path": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-os": "^3.0.1"
}
},
"node_modules/bare-stream": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"streamx": "^2.21.0"
},
"peerDependencies": {
"bare-buffer": "*",
"bare-events": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
},
"bare-events": {
"optional": true
}
}
},
"node_modules/bare-url": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-path": "^3.0.0"
}
},
"node_modules/basic-ftp": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
}
},
"node_modules/buffer-crc32": {
"version": "0.2.13",
"resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
"integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
"license": "MIT",
"engines": {
"node": "*"
}
},
"node_modules/chromium-bidi": {
"version": "12.0.1",
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
"integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
"license": "Apache-2.0",
"dependencies": {
"mitt": "^3.0.1",
"zod": "^3.24.1"
},
"peerDependencies": {
"devtools-protocol": "*"
}
},
"node_modules/cliui": {
"version": "8.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
"license": "ISC",
"dependencies": {
"string-width": "^4.2.0",
"strip-ansi": "^6.0.1",
"wrap-ansi": "^7.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
"license": "MIT",
"dependencies": {
"color-name": "~1.1.4"
},
"engines": {
"node": ">=7.0.0"
}
},
"node_modules/color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
"license": "MIT"
},
"node_modules/data-uri-to-buffer": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/debug": {
"version": "4.4.3",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
"license": "MIT",
"dependencies": {
"ms": "^2.1.3"
},
"engines": {
"node": ">=6.0"
},
"peerDependenciesMeta": {
"supports-color": {
"optional": true
}
}
},
"node_modules/degenerator": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
"integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
"license": "MIT",
"dependencies": {
"ast-types": "^0.13.4",
"escodegen": "^2.1.0",
"esprima": "^4.0.1"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/devtools-protocol": {
"version": "0.0.1534754",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
"integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
"license": "BSD-3-Clause",
"peer": true
},
"node_modules/emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
"license": "MIT"
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
"license": "MIT",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/escalade": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
"integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/escodegen": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
"integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
"license": "BSD-2-Clause",
"dependencies": {
"esprima": "^4.0.1",
"estraverse": "^5.2.0",
"esutils": "^2.0.2"
},
"bin": {
"escodegen": "bin/escodegen.js",
"esgenerate": "bin/esgenerate.js"
},
"engines": {
"node": ">=6.0"
},
"optionalDependencies": {
"source-map": "~0.6.1"
}
},
"node_modules/esprima": {
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
"license": "BSD-2-Clause",
"bin": {
"esparse": "bin/esparse.js",
"esvalidate": "bin/esvalidate.js"
},
"engines": {
"node": ">=4"
}
},
"node_modules/estraverse": {
"version": "5.3.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=4.0"
}
},
"node_modules/esutils": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
"integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/events-universal": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
"license": "Apache-2.0",
"dependencies": {
"bare-events": "^2.7.0"
}
},
"node_modules/extract-zip": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
"integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
"license": "BSD-2-Clause",
"dependencies": {
"debug": "^4.1.1",
"get-stream": "^5.1.0",
"yauzl": "^2.10.0"
},
"bin": {
"extract-zip": "cli.js"
},
"engines": {
"node": ">= 10.17.0"
},
"optionalDependencies": {
"@types/yauzl": "^2.9.1"
}
},
"node_modules/fast-fifo": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
"license": "MIT"
},
"node_modules/fd-slicer": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
"integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
"license": "MIT",
"dependencies": {
"pend": "~1.2.0"
}
},
"node_modules/get-caller-file": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
"license": "ISC",
"engines": {
"node": "6.* || 8.* || >= 10.*"
}
},
"node_modules/get-stream": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
"license": "MIT",
"dependencies": {
"pump": "^3.0.0"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/get-uri": {
"version": "6.0.5",
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
"integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
"license": "MIT",
"dependencies": {
"basic-ftp": "^5.0.2",
"data-uri-to-buffer": "^6.0.2",
"debug": "^4.3.4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/http-proxy-agent": {
"version": "7.0.2",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.0",
"debug": "^4.3.4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/https-proxy-agent": {
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/ip-address": {
"version": "10.1.0",
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
"license": "MIT",
"engines": {
"node": ">= 12"
}
},
"node_modules/is-fullwidth-code-point": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/lru-cache": {
"version": "7.18.3",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
"integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/mitt": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
"license": "MIT"
},
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/netmask": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
"integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
"license": "MIT",
"engines": {
"node": ">= 0.4.0"
}
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"license": "ISC",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/pac-proxy-agent": {
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
"integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
"license": "MIT",
"dependencies": {
"@tootallnate/quickjs-emscripten": "^0.23.0",
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"get-uri": "^6.0.1",
"http-proxy-agent": "^7.0.0",
"https-proxy-agent": "^7.0.6",
"pac-resolver": "^7.0.1",
"socks-proxy-agent": "^8.0.5"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/pac-resolver": {
"version": "7.0.1",
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
"license": "MIT",
"dependencies": {
"degenerator": "^5.0.0",
"netmask": "^2.0.2"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
"license": "MIT"
},
"node_modules/progress": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
"license": "MIT",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/proxy-agent": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
"integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"http-proxy-agent": "^7.0.1",
"https-proxy-agent": "^7.0.6",
"lru-cache": "^7.14.1",
"pac-proxy-agent": "^7.1.0",
"proxy-from-env": "^1.1.0",
"socks-proxy-agent": "^8.0.5"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/proxy-from-env": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
"license": "MIT"
},
"node_modules/pump": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
"integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
"license": "MIT",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/puppeteer-core": {
"version": "24.34.0",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
"integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
"license": "Apache-2.0",
"dependencies": {
"@puppeteer/browsers": "2.11.0",
"chromium-bidi": "12.0.1",
"debug": "^4.4.3",
"devtools-protocol": "0.0.1534754",
"typed-query-selector": "^2.12.0",
"webdriver-bidi-protocol": "0.3.10",
"ws": "^8.18.3"
},
"engines": {
"node": ">=18"
}
},
"node_modules/require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/semver": {
"version": "7.7.3",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
"license": "ISC",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/smart-buffer": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
"integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
"license": "MIT",
"engines": {
"node": ">= 6.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks": {
"version": "2.8.7",
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
"integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
"license": "MIT",
"dependencies": {
"ip-address": "^10.0.1",
"smart-buffer": "^4.2.0"
},
"engines": {
"node": ">= 10.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks-proxy-agent": {
"version": "8.0.5",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
"integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"socks": "^2.8.3"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
"license": "BSD-3-Clause",
"optional": true,
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/streamx": {
"version": "2.23.0",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
"integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
"license": "MIT",
"dependencies": {
"events-universal": "^1.0.0",
"fast-fifo": "^1.3.2",
"text-decoder": "^1.1.0"
}
},
"node_modules/string-width": {
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
"license": "MIT",
"dependencies": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
"strip-ansi": "^6.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/strip-ansi": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
"license": "MIT",
"dependencies": {
"ansi-regex": "^5.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/tar-fs": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
"license": "MIT",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^4.0.1",
"bare-path": "^3.0.0"
}
},
"node_modules/tar-stream": {
"version": "3.1.7",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
"license": "MIT",
"dependencies": {
"b4a": "^1.6.4",
"fast-fifo": "^1.2.0",
"streamx": "^2.15.0"
}
},
"node_modules/text-decoder": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
"integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
"license": "Apache-2.0",
"dependencies": {
"b4a": "^1.6.4"
}
},
"node_modules/tslib": {
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"license": "0BSD"
},
"node_modules/typed-query-selector": {
"version": "2.12.0",
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
"integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
"license": "MIT"
},
"node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
"license": "MIT",
"optional": true
},
"node_modules/webdriver-bidi-protocol": {
"version": "0.3.10",
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
"integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
"license": "Apache-2.0"
},
"node_modules/wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
"license": "MIT",
"dependencies": {
"ansi-styles": "^4.0.0",
"string-width": "^4.1.0",
"strip-ansi": "^6.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
}
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
"license": "ISC"
},
"node_modules/ws": {
"version": "8.18.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
"license": "ISC",
"engines": {
"node": ">=10"
}
},
"node_modules/yargs": {
"version": "17.7.2",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
"integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
"license": "MIT",
"dependencies": {
"cliui": "^8.0.1",
"escalade": "^3.1.1",
"get-caller-file": "^2.0.5",
"require-directory": "^2.1.1",
"string-width": "^4.2.3",
"y18n": "^5.0.5",
"yargs-parser": "^21.1.1"
},
"engines": {
"node": ">=12"
}
},
"node_modules/yargs-parser": {
"version": "21.1.1",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
"integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/yauzl": {
"version": "2.10.0",
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
"integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
"license": "MIT",
"dependencies": {
"buffer-crc32": "~0.2.3",
"fd-slicer": "~1.1.0"
}
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
}
}
}

View File

@@ -1 +0,0 @@
{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_PAPERSDL": {
"PAPERSDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
"description": "Enable paper downloading with papers-dl"
},
"PAPERSDL_BINARY": {

View File

@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
if normalized != url:
urls_found.add(unescape(normalized))
if not urls_found:
click.echo('No URLs found', err=True)
sys.exit(1)
# Emit Snapshot records to stdout (JSONL)
for found_url in sorted(urls_found):
record = {
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
print(json.dumps(record))
click.echo(f'Found {len(urls_found)} URLs', err=True)
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -27,12 +27,13 @@ class TestParseHtmlUrls:
assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists(), "Output file not created"
# Verify stdout contains JSONL records for discovered URLs
# example.com links to iana.org
assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
# Verify output contains IANA link (example.com links to iana.org)
content = output_file.read_text()
assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
# Verify ArchiveResult record is present
assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
assert '"status": "succeeded"' in result.stdout, "Missing success status"
def test_extracts_href_urls(self, tmp_path):
"""Test extracting URLs from anchor tags."""
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
assert 'Found 3 URLs' in result.stderr
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 3
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
urls = set()
for line in lines:
entry = json.loads(line)
assert entry['type'] == 'Snapshot'
assert 'url' in entry
urls.add(entry['url'])
@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
assert 'https://foo.bar/page' in urls
assert 'http://test.org' in urls
# Verify ArchiveResult record
assert '"type": "ArchiveResult"' in result.stdout
assert '"status": "succeeded"' in result.stdout
def test_ignores_non_http_schemes(self, tmp_path):
"""Test that non-http schemes are ignored."""
input_file = tmp_path / 'page.html'
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 1
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
entry = json.loads(lines[0])
assert entry['url'] == 'https://valid.com'
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/page?a=1&b=2'
def test_deduplicates_urls(self, tmp_path):
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 1
def test_excludes_source_url(self, tmp_path):
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 1
entry = json.loads(lines[0])
assert entry['url'] == 'https://other.com'
def test_exits_1_when_no_urls_found(self, tmp_path):
"""Test that script exits with code 1 when no URLs found."""
def test_skips_when_no_urls_found(self, tmp_path):
"""Test that script returns skipped status when no URLs found."""
input_file = tmp_path / 'page.html'
input_file.write_text('<html><body>No links here</body></html>')
@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
text=True,
)
assert result.returncode == 1
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_handles_malformed_html(self, tmp_path):
"""Test handling of malformed HTML."""
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 2
def test_output_is_valid_json(self, tmp_path):
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'plugin' in entry
assert entry['type'] == 'Snapshot'
assert entry['plugin'] == 'parse_html_urls'
if __name__ == '__main__':

Some files were not shown because too many files have changed in this diff Show More