mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
wip
This commit is contained in:
@@ -23,7 +23,9 @@
|
||||
"Bash(source .venv/bin/activate)",
|
||||
"Bash(mv:*)",
|
||||
"Bash(echo:*)",
|
||||
"Bash(grep:*)"
|
||||
"Bash(grep:*)",
|
||||
"WebFetch(domain:python-statemachine.readthedocs.io)",
|
||||
"Bash(./bin/run_plugin_tests.sh:*)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,12 +24,14 @@ ASCII_LOGO = """
|
||||
╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝
|
||||
"""
|
||||
|
||||
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
|
||||
# without necessarily waiting for django to load them through INSTALLED_APPS
|
||||
PACKAGE_DIR = Path(__file__).resolve().parent
|
||||
|
||||
# Add PACKAGE_DIR to sys.path - required for Django migrations to import models
|
||||
# Migrations reference models like 'machine.Binary' which need to be importable
|
||||
if str(PACKAGE_DIR) not in sys.path:
|
||||
sys.path.append(str(PACKAGE_DIR))
|
||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
|
||||
|
||||
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
|
||||
os.environ['TZ'] = 'UTC'
|
||||
|
||||
# detect ArchiveBox user's UID/GID based on data dir ownership
|
||||
|
||||
@@ -5,7 +5,7 @@ from signal_webhooks.utils import get_webhook_model
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
from api.models import APIToken
|
||||
from archivebox.api.models import APIToken
|
||||
|
||||
|
||||
class APITokenAdmin(BaseModelAdmin):
|
||||
|
||||
@@ -4,9 +4,9 @@ from django.apps import AppConfig
|
||||
|
||||
|
||||
class APIConfig(AppConfig):
|
||||
name = 'api'
|
||||
name = 'archivebox.api'
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
from api.admin import register_admin
|
||||
from archivebox.api.admin import register_admin
|
||||
register_admin(admin_site)
|
||||
|
||||
@@ -7,7 +7,7 @@ from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
import api.models
|
||||
import archivebox.api.models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -38,7 +38,7 @@ class Migration(migrations.Migration):
|
||||
('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, db_index=True)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('token', models.CharField(default=api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
|
||||
('expires', models.DateTimeField(blank=True, null=True)),
|
||||
],
|
||||
options={
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Generated by Django 6.0 on 2025-12-27 01:40
|
||||
|
||||
import base_models.models
|
||||
import archivebox.core.models
|
||||
import django.db.models.deletion
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
@@ -17,11 +17,11 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='apitoken',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='outboundwebhook',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
field=models.ForeignKey(default=archivebox.core.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -10,7 +10,7 @@ from django.utils import timezone
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
from signal_webhooks.models import WebhookBase
|
||||
|
||||
from base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
|
||||
def generate_secret_token() -> str:
|
||||
@@ -26,6 +26,7 @@ class APIToken(models.Model):
|
||||
expires = models.DateTimeField(null=True, blank=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'api'
|
||||
verbose_name = "API Key"
|
||||
verbose_name_plural = "API Keys"
|
||||
|
||||
@@ -47,6 +48,7 @@ class OutboundWebhook(WebhookBase):
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
class Meta(WebhookBase.Meta):
|
||||
app_label = 'api'
|
||||
verbose_name = 'API Outbound Webhook'
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
||||
@@ -15,7 +15,7 @@ from ninja import NinjaAPI, Swagger
|
||||
from archivebox.config import VERSION
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
|
||||
from api.auth import API_AUTH_METHODS
|
||||
from archivebox.api.auth import API_AUTH_METHODS
|
||||
|
||||
|
||||
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
|
||||
|
||||
@@ -6,8 +6,8 @@ from ninja import Router, Schema
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
|
||||
from api.models import APIToken
|
||||
from api.auth import auth_using_token, auth_using_password, get_or_create_api_token
|
||||
from archivebox.api.models import APIToken
|
||||
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
|
||||
|
||||
|
||||
router = Router(tags=['Authentication'], auth=None)
|
||||
|
||||
@@ -118,6 +118,7 @@ def cli_add(request, args: AddCommandSchema):
|
||||
plugins=args.plugins,
|
||||
parser=args.parser,
|
||||
bg=True, # Always run in background for API calls
|
||||
created_by_id=request.user.pk,
|
||||
)
|
||||
|
||||
return {
|
||||
|
||||
@@ -14,8 +14,8 @@ from ninja import Router, Schema, FilterSchema, Field, Query
|
||||
from ninja.pagination import paginate, PaginationBase
|
||||
from ninja.errors import HttpError
|
||||
|
||||
from core.models import Snapshot, ArchiveResult, Tag
|
||||
from api.v1_crawls import CrawlSchema
|
||||
from archivebox.core.models import Snapshot, ArchiveResult, Tag
|
||||
from archivebox.api.v1_crawls import CrawlSchema
|
||||
|
||||
|
||||
router = Router(tags=['Core Models'])
|
||||
@@ -80,12 +80,11 @@ class MinimalArchiveResultSchema(Schema):
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
return str(obj.created_by_id)
|
||||
return str(obj.created_by.pk)
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_username(obj) -> str:
|
||||
User = get_user_model()
|
||||
return User.objects.filter(pk=obj.created_by_id).values_list('username', flat=True)[0]
|
||||
return obj.created_by.username
|
||||
|
||||
|
||||
class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
@@ -166,12 +165,11 @@ class SnapshotSchema(Schema):
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_id(obj):
|
||||
return str(obj.created_by_id)
|
||||
return str(obj.created_by.pk)
|
||||
|
||||
@staticmethod
|
||||
def resolve_created_by_username(obj):
|
||||
User = get_user_model()
|
||||
return User.objects.get(id=obj.created_by_id).username
|
||||
return obj.created_by.username
|
||||
|
||||
@staticmethod
|
||||
def resolve_tags(obj):
|
||||
@@ -190,8 +188,8 @@ class SnapshotSchema(Schema):
|
||||
|
||||
class SnapshotFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q=['id__icontains', 'timestamp__startswith'])
|
||||
created_by_id: str = Field(None, q='created_by_id')
|
||||
created_by_username: str = Field(None, q='created_by__username__icontains')
|
||||
created_by_id: str = Field(None, q='crawl__created_by_id')
|
||||
created_by_username: str = Field(None, q='crawl__created_by__username__icontains')
|
||||
created_at__gte: datetime = Field(None, q='created_at__gte')
|
||||
created_at__lt: datetime = Field(None, q='created_at__lt')
|
||||
created_at: datetime = Field(None, q='created_at')
|
||||
|
||||
@@ -9,8 +9,8 @@ from django.contrib.auth import get_user_model
|
||||
|
||||
from ninja import Router, Schema
|
||||
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
from .auth import API_AUTH_METHODS
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from datetime import datetime
|
||||
from ninja import Router, Schema, FilterSchema, Field, Query
|
||||
from ninja.pagination import paginate
|
||||
|
||||
from api.v1_core import CustomPagination
|
||||
from archivebox.api.v1_core import CustomPagination
|
||||
|
||||
|
||||
router = Router(tags=['Machine and Dependencies'])
|
||||
@@ -102,14 +102,14 @@ class BinaryFilterSchema(FilterSchema):
|
||||
@paginate(CustomPagination)
|
||||
def get_machines(request, filters: MachineFilterSchema = Query(...)):
|
||||
"""List all machines."""
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
return filters.filter(Machine.objects.all()).distinct()
|
||||
|
||||
|
||||
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
|
||||
def get_machine(request, machine_id: str):
|
||||
"""Get a specific machine by ID."""
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
from django.db.models import Q
|
||||
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||
|
||||
@@ -117,7 +117,7 @@ def get_machine(request, machine_id: str):
|
||||
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
|
||||
def get_current_machine(request):
|
||||
"""Get the current machine."""
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
return Machine.current()
|
||||
|
||||
|
||||
@@ -132,19 +132,19 @@ def get_current_machine(request):
|
||||
@paginate(CustomPagination)
|
||||
def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
|
||||
"""List all binaries."""
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
return filters.filter(Binary.objects.all().select_related('machine', 'dependency')).distinct()
|
||||
|
||||
|
||||
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
|
||||
def get_binary(request, binary_id: str):
|
||||
"""Get a specific binary by ID."""
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
return Binary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
|
||||
|
||||
|
||||
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
|
||||
def get_binaries_by_name(request, name: str):
|
||||
"""Get all binaries with the given name."""
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
return list(Binary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
|
||||
|
||||
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
|
||||
from django.contrib import admin
|
||||
from django.db import models
|
||||
from django.db.models import F
|
||||
from django.utils import timezone
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.urls import reverse_lazy
|
||||
@@ -110,6 +111,11 @@ class ModelWithHealthStats(models.Model):
|
||||
total = max(self.num_uses_failed + self.num_uses_succeeded, 1)
|
||||
return round((self.num_uses_succeeded / total) * 100)
|
||||
|
||||
def increment_health_stats(self, success: bool):
    """Bump this row's success or failure counter by one.

    The increment is written with a queryset ``update()`` and an ``F()``
    expression, so the new value is computed from the stored column rather
    than from this instance's in-memory value. The instance itself is not
    refreshed afterwards.

    Args:
        success: True to bump ``num_uses_succeeded``, False to bump
            ``num_uses_failed``.
    """
    if success:
        counter = 'num_uses_succeeded'
    else:
        counter = 'num_uses_failed'
    bump = {counter: F(counter) + 1}
    matching_row = type(self).objects.filter(pk=self.pk)
    matching_row.update(**bump)
|
||||
|
||||
|
||||
class ModelWithConfig(models.Model):
|
||||
"""Mixin for models with a JSON config field."""
|
||||
|
||||
@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
@enforce_types
|
||||
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
|
||||
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
|
||||
|
||||
# import models once django is set up
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
|
||||
@@ -66,18 +66,38 @@ def config(*keys,
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = FLAT_CONFIG
|
||||
|
||||
|
||||
# Display core config sections
|
||||
for config_section in CONFIGS.values():
|
||||
if hasattr(config_section, 'toml_section_header'):
|
||||
print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
|
||||
else:
|
||||
print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
|
||||
|
||||
|
||||
kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
|
||||
print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
|
||||
|
||||
|
||||
# Display plugin config section
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_keys = {}
|
||||
|
||||
# Collect all plugin config keys
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' not in schema:
|
||||
continue
|
||||
for key in schema['properties'].keys():
|
||||
if key in matching_config:
|
||||
plugin_keys[key] = matching_config[key]
|
||||
|
||||
# Display all plugin config in single [PLUGINS] section
|
||||
if plugin_keys:
|
||||
print(f'[grey53]\\[PLUGINS][/grey53]')
|
||||
print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
|
||||
print('[grey53]################################################################[/grey53]')
|
||||
|
||||
raise SystemExit(not matching_config)
|
||||
|
||||
elif set:
|
||||
|
||||
@@ -72,11 +72,11 @@ def discover_outlinks(
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, get_or_create_snapshot
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
@@ -130,8 +130,10 @@ def discover_outlinks(
|
||||
record['crawl_id'] = str(crawl.id)
|
||||
record['depth'] = record.get('depth', 0)
|
||||
|
||||
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
snapshot_ids.append(str(snapshot.id))
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
|
||||
if snapshot:
|
||||
snapshot_ids.append(str(snapshot.id))
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
@@ -162,7 +164,6 @@ def discover_outlinks(
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': snapshot.created_by_id,
|
||||
}
|
||||
)
|
||||
else:
|
||||
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
|
||||
- Transition from started -> sealed (when all snapshots done)
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from crawls.models import Crawl
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
try:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually a Crawl (not a Snapshot or other object)
|
||||
from crawls.models import Crawl
|
||||
from archivebox.crawls.models import Crawl
|
||||
return Crawl.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from core.models import ArchiveResult
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
try:
|
||||
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
||||
@@ -95,7 +95,7 @@ def run_plugins(
|
||||
read_args_or_stdin, write_record, archiveresult_to_jsonl,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
)
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
@@ -155,7 +155,6 @@ def run_plugins(
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': snapshot.created_by_id,
|
||||
}
|
||||
)
|
||||
if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
|
||||
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
|
||||
if not uuid_pattern.match(value):
|
||||
return False
|
||||
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
||||
from core.models import ArchiveResult
|
||||
from archivebox.core.models import ArchiveResult
|
||||
return ArchiveResult.objects.filter(id=value).exists()
|
||||
|
||||
|
||||
|
||||
@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
|
||||
print()
|
||||
print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
|
||||
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
all_links = Snapshot.objects.none()
|
||||
pending_links: dict[str, SnapshotDict] = {}
|
||||
|
||||
@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from crawls.models import Crawl
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
# Create a crawl for dependency detection
|
||||
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
|
||||
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
|
||||
|
||||
# Verify the crawl is in the queue
|
||||
from crawls.models import Crawl as CrawlModel
|
||||
from archivebox.crawls.models import Crawl as CrawlModel
|
||||
queued_crawls = CrawlModel.objects.filter(
|
||||
retry_at__lte=timezone.now()
|
||||
).exclude(
|
||||
|
||||
@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
|
||||
to_remove = snapshots.count()
|
||||
|
||||
from archivebox.search import flush_search_index
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
flush_search_index(snapshots=snapshots)
|
||||
snapshots.delete()
|
||||
|
||||
@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
|
||||
before: Optional[float]=None,
|
||||
out_dir: Path=DATA_DIR) -> QuerySet:
|
||||
"""Filter and return Snapshots matching the given criteria."""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if snapshots:
|
||||
result = snapshots
|
||||
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
|
||||
csv: str | None=None,
|
||||
with_headers: bool=False):
|
||||
"""List, filter, and export information about archive entries"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
if with_headers and not (json or html or csv):
|
||||
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||
|
||||
@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
|
||||
- Transition from started -> sealed (when all ArchiveResults done)
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
@@ -88,11 +88,11 @@ def create_snapshots(
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, snapshot_to_jsonl,
|
||||
TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
|
||||
TYPE_SNAPSHOT, TYPE_TAG
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
@@ -137,8 +137,10 @@ def create_snapshots(
|
||||
record['tags'] = tag
|
||||
|
||||
# Get or create the snapshot
|
||||
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
created_snapshots.append(snapshot)
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
|
||||
if snapshot:
|
||||
created_snapshots.append(snapshot)
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
|
||||
@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from archivebox.misc.db import get_admins
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
User = get_user_model()
|
||||
|
||||
print('[green]\\[*] Scanning archive main index...[/green]')
|
||||
|
||||
@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.utils import timezone
|
||||
|
||||
while True:
|
||||
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
|
||||
Skip symlinks (already migrated).
|
||||
Create DB records and trigger migration on save().
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.config import CONSTANTS
|
||||
from django.db import transaction
|
||||
|
||||
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
|
||||
Process all snapshots in DB.
|
||||
Reconcile index.json and queue for archiving.
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
|
||||
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
|
||||
batch_size: int
|
||||
) -> dict:
|
||||
"""Process snapshots matching filters (DB query only)."""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from django.db import transaction
|
||||
from django.utils import timezone
|
||||
from datetime import datetime
|
||||
|
||||
@@ -107,7 +107,7 @@ def version(quiet: bool=False,
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from machine.models import Machine, Binary
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
|
||||
@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
Test: archivebox snapshot URL
|
||||
Should create a Snapshot and output JSONL when piped.
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, snapshot_to_jsonl,
|
||||
TYPE_SNAPSHOT, get_or_create_snapshot
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertEqual(records[0]['url'], url)
|
||||
|
||||
# Create snapshot
|
||||
snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
|
||||
|
||||
self.assertIsNotNone(snapshot.id)
|
||||
self.assertEqual(snapshot.url, url)
|
||||
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
Test: archivebox snapshot URL | archivebox extract
|
||||
Extract should accept JSONL output from snapshot command.
|
||||
"""
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.misc.jsonl import (
|
||||
snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
|
||||
snapshot_to_jsonl, read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
# Step 1: Create snapshot (simulating 'archivebox snapshot')
|
||||
url = 'https://test-extract-1.example.com'
|
||||
snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot_to_jsonl(snapshot)
|
||||
|
||||
# Step 2: Parse snapshot output as extract input
|
||||
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
This is equivalent to: archivebox add URL
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
|
||||
This is equivalent to: archivebox add --depth=1 URL
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
|
||||
|
||||
Depth 0: Only archive the specified URL, no crawling.
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import get_or_create_snapshot
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
|
||||
@@ -35,177 +35,41 @@ def _get_config():
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
def __getattr__(name: str):
|
||||
"""Module-level __getattr__ for lazy config loading."""
|
||||
|
||||
# Timeout settings
|
||||
"""
|
||||
Module-level __getattr__ for lazy config loading.
|
||||
|
||||
Only provides backwards compatibility for GENERIC/SHARED config.
|
||||
Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
|
||||
"""
|
||||
|
||||
# Generic timeout settings (used by multiple plugins)
|
||||
if name == 'TIMEOUT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.TIMEOUT
|
||||
if name == 'MEDIA_TIMEOUT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.MEDIA_TIMEOUT
|
||||
|
||||
# SSL/Security settings
|
||||
|
||||
# Generic SSL/Security settings (used by multiple plugins)
|
||||
if name == 'CHECK_SSL_VALIDITY':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.CHECK_SSL_VALIDITY
|
||||
|
||||
# Storage settings
|
||||
|
||||
# Generic storage settings (used by multiple plugins)
|
||||
if name == 'RESTRICT_FILE_NAMES':
|
||||
_, storage = _get_config()
|
||||
return storage.RESTRICT_FILE_NAMES
|
||||
|
||||
# User agent / cookies
|
||||
|
||||
# Generic user agent / cookies (used by multiple plugins)
|
||||
if name == 'COOKIES_FILE':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.COOKIES_FILE
|
||||
if name == 'USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'CURL_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'WGET_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'CHROME_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
|
||||
# Archive method toggles (SAVE_*)
|
||||
if name == 'SAVE_TITLE':
|
||||
return True
|
||||
if name == 'SAVE_FAVICON':
|
||||
return True
|
||||
if name == 'SAVE_WGET':
|
||||
return True
|
||||
if name == 'SAVE_WARC':
|
||||
return True
|
||||
if name == 'SAVE_WGET_REQUISITES':
|
||||
return True
|
||||
if name == 'SAVE_SINGLEFILE':
|
||||
return True
|
||||
if name == 'SAVE_READABILITY':
|
||||
return True
|
||||
if name == 'SAVE_MERCURY':
|
||||
return True
|
||||
if name == 'SAVE_HTMLTOTEXT':
|
||||
return True
|
||||
if name == 'SAVE_PDF':
|
||||
return True
|
||||
if name == 'SAVE_SCREENSHOT':
|
||||
return True
|
||||
if name == 'SAVE_DOM':
|
||||
return True
|
||||
if name == 'SAVE_HEADERS':
|
||||
return True
|
||||
if name == 'SAVE_GIT':
|
||||
return True
|
||||
if name == 'SAVE_MEDIA':
|
||||
return True
|
||||
if name == 'SAVE_ARCHIVE_DOT_ORG':
|
||||
return True
|
||||
|
||||
# Extractor-specific settings
|
||||
|
||||
# Generic resolution settings (used by multiple plugins)
|
||||
if name == 'RESOLUTION':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.RESOLUTION
|
||||
if name == 'GIT_DOMAINS':
|
||||
return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
|
||||
if name == 'MEDIA_MAX_SIZE':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.MEDIA_MAX_SIZE
|
||||
if name == 'FAVICON_PROVIDER':
|
||||
return 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
# Binary paths (use shutil.which for detection)
|
||||
if name == 'CURL_BINARY':
|
||||
return shutil.which('curl') or 'curl'
|
||||
if name == 'WGET_BINARY':
|
||||
return shutil.which('wget') or 'wget'
|
||||
if name == 'GIT_BINARY':
|
||||
return shutil.which('git') or 'git'
|
||||
if name == 'YOUTUBEDL_BINARY':
|
||||
return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
|
||||
if name == 'CHROME_BINARY':
|
||||
for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
|
||||
path = shutil.which(chrome)
|
||||
if path:
|
||||
return path
|
||||
return 'chromium'
|
||||
if name == 'NODE_BINARY':
|
||||
return shutil.which('node') or 'node'
|
||||
if name == 'SINGLEFILE_BINARY':
|
||||
return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
|
||||
if name == 'READABILITY_BINARY':
|
||||
return shutil.which('readability-extractor') or 'readability-extractor'
|
||||
if name == 'MERCURY_BINARY':
|
||||
return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
|
||||
|
||||
# Binary versions (return placeholder, actual version detection happens elsewhere)
|
||||
if name == 'CURL_VERSION':
|
||||
return 'curl'
|
||||
if name == 'WGET_VERSION':
|
||||
return 'wget'
|
||||
if name == 'GIT_VERSION':
|
||||
return 'git'
|
||||
if name == 'YOUTUBEDL_VERSION':
|
||||
return 'yt-dlp'
|
||||
if name == 'CHROME_VERSION':
|
||||
return 'chromium'
|
||||
if name == 'SINGLEFILE_VERSION':
|
||||
return 'singlefile'
|
||||
if name == 'READABILITY_VERSION':
|
||||
return 'readability'
|
||||
if name == 'MERCURY_VERSION':
|
||||
return 'mercury'
|
||||
|
||||
# Binary arguments
|
||||
if name == 'CURL_ARGS':
|
||||
return ['--silent', '--location', '--compressed']
|
||||
if name == 'WGET_ARGS':
|
||||
return [
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
]
|
||||
if name == 'GIT_ARGS':
|
||||
return ['--recursive']
|
||||
if name == 'YOUTUBEDL_ARGS':
|
||||
cfg, _ = _get_config()
|
||||
return [
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--write-sub',
|
||||
'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
'--no-abort-on-error',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--add-metadata',
|
||||
f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
|
||||
]
|
||||
if name == 'SINGLEFILE_ARGS':
|
||||
return None # Uses defaults
|
||||
if name == 'CHROME_ARGS':
|
||||
return []
|
||||
|
||||
# Other settings
|
||||
if name == 'WGET_AUTO_COMPRESSION':
|
||||
return True
|
||||
if name == 'DEPENDENCIES':
|
||||
return {} # Legacy, not used anymore
|
||||
|
||||
|
||||
# Allowlist/Denylist patterns (compiled regexes)
|
||||
if name == 'SAVE_ALLOWLIST_PTN':
|
||||
cfg, _ = _get_config()
|
||||
@@ -213,7 +77,7 @@ def __getattr__(name: str):
|
||||
if name == 'SAVE_DENYLIST_PTN':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_DENYLIST_PTNS
|
||||
|
||||
|
||||
raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
|
||||
|
||||
|
||||
|
||||
@@ -111,6 +111,24 @@ def load_config_file() -> Optional[benedict]:
|
||||
return None
|
||||
|
||||
|
||||
class PluginConfigSection:
    """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf

    An instance is bound to a single config key. ``hasattr(section, key)`` is
    true only for that key (its value reads as ``None``); every other
    attribute lookup raises ``AttributeError``.
    """

    toml_section_header = "PLUGINS"

    def __init__(self, key: str):
        # Name of the one plugin config key this pseudo-section answers for.
        self._key = key

    def __getattr__(self, name: str) -> Any:
        """Return ``None`` for the bound key so hasattr checks pass; raise otherwise.

        Raises:
            AttributeError: if ``name`` is not the key this section was created for.
        """
        # __getattr__ only runs after normal attribute lookup has failed.
        # Read ``_key`` straight from the instance dict: going through
        # ``self._key`` would re-enter __getattr__ without end on an instance
        # whose __init__ never ran (copy.copy(), pickle, ``__new__``) and
        # surface as RecursionError instead of AttributeError.
        bound_key = self.__dict__.get('_key')
        if bound_key is not None and name == bound_key:
            return None
        raise AttributeError(f"PluginConfigSection has no attribute '{name}'")

    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs):
        """No-op update since plugins read config dynamically via get_config()."""
        pass
|
||||
|
||||
|
||||
def section_for_key(key: str) -> Any:
|
||||
"""Find the config section containing a given key."""
|
||||
from archivebox.config.common import (
|
||||
@@ -121,11 +139,22 @@ def section_for_key(key: str) -> Any:
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
|
||||
# First check core config sections
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
if hasattr(section, key):
|
||||
return section
|
||||
|
||||
# Check if this is a plugin config key
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' in schema and key in schema['properties']:
|
||||
# All plugin config goes to [PLUGINS] section
|
||||
return PluginConfigSection(key)
|
||||
|
||||
raise ValueError(f'No config section found for key: {key}')
|
||||
|
||||
|
||||
|
||||
@@ -123,9 +123,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MEDIA_TIMEOUT: int = Field(default=3600)
|
||||
|
||||
MEDIA_MAX_SIZE: str = Field(default="750m")
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(
|
||||
@@ -141,15 +139,6 @@ class ArchivingConfig(BaseConfigSet):
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default="Default")
|
||||
|
||||
# GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||
# WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
|
||||
# CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
|
||||
# CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
|
||||
# CHROME_USER_DATA_DIR: str | None = Field(default=None)
|
||||
# CHROME_TIMEOUT: int = Field(default=0)
|
||||
# CHROME_HEADLESS: bool = Field(default=True)
|
||||
# CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
|
||||
|
||||
def validate(self):
|
||||
if int(self.TIMEOUT) < 5:
|
||||
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
|
||||
@@ -215,7 +204,6 @@ class SearchBackendConfig(BaseConfigSet):
|
||||
|
||||
SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
|
||||
SEARCH_PROCESS_HTML: bool = Field(default=True)
|
||||
SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
|
||||
|
||||
|
||||
SEARCH_BACKEND_CONFIG = SearchBackendConfig()
|
||||
|
||||
@@ -174,7 +174,7 @@ def get_config(
|
||||
config.update(dict(ARCHIVING_CONFIG))
|
||||
config.update(dict(SEARCH_BACKEND_CONFIG))
|
||||
|
||||
# Load from config file
|
||||
# Load from the ArchiveBox config file (CONSTANTS.CONFIG_FILE)
|
||||
config_file = CONSTANTS.CONFIG_FILE
|
||||
if config_file.exists():
|
||||
file_config = BaseConfigSet.load_from_file(config_file)
|
||||
|
||||
@@ -17,7 +17,7 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
|
||||
# Common binaries to check for
|
||||
|
||||
@@ -4,7 +4,7 @@ __order__ = 100
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from core.admin import register_admin as do_register
|
||||
from archivebox.core.admin import register_admin as do_register
|
||||
do_register(admin_site)
|
||||
|
||||
|
||||
|
||||
@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
from core.models import Snapshot, ArchiveResult, Tag
|
||||
from core.admin_tags import TagAdmin
|
||||
from core.admin_snapshots import SnapshotAdmin
|
||||
from core.admin_archiveresults import ArchiveResultAdmin
|
||||
from core.admin_users import UserAdmin
|
||||
from archivebox.core.models import Snapshot, ArchiveResult, Tag
|
||||
from archivebox.core.admin_tags import TagAdmin
|
||||
from archivebox.core.admin_snapshots import SnapshotAdmin
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
|
||||
from archivebox.core.admin_users import UserAdmin
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
|
||||
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
formset.form.base_fields['end_ts'].initial = timezone.now()
|
||||
formset.form.base_fields['cmd_version'].initial = '-'
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
|
||||
formset.form.base_fields['created_by'].initial = request.user
|
||||
formset.form.base_fields['cmd'].initial = '["-"]'
|
||||
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
|
||||
|
||||
|
||||
if obj is not None:
|
||||
# hidden values for existing entries and new entries
|
||||
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
|
||||
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
|
||||
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
|
||||
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
|
||||
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
|
||||
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
|
||||
return formset
|
||||
|
||||
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')
|
||||
|
||||
@@ -38,11 +38,11 @@ def register_admin_site():
|
||||
|
||||
# Register admin views for each app
|
||||
# (Previously handled by ABX plugin system, now called directly)
|
||||
from core.admin import register_admin as register_core_admin
|
||||
from crawls.admin import register_admin as register_crawls_admin
|
||||
from api.admin import register_admin as register_api_admin
|
||||
from machine.admin import register_admin as register_machine_admin
|
||||
from workers.admin import register_admin as register_workers_admin
|
||||
from archivebox.core.admin import register_admin as register_core_admin
|
||||
from archivebox.crawls.admin import register_admin as register_crawls_admin
|
||||
from archivebox.api.admin import register_admin as register_api_admin
|
||||
from archivebox.machine.admin import register_admin as register_machine_admin
|
||||
from archivebox.workers.admin import register_admin as register_workers_admin
|
||||
|
||||
register_core_admin(archivebox_admin)
|
||||
register_crawls_admin(archivebox_admin)
|
||||
|
||||
@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from core.models import Tag, Snapshot
|
||||
from core.admin_tags import TagInline
|
||||
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
from archivebox.core.models import Tag, Snapshot
|
||||
from archivebox.core.admin_tags import TagInline
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
|
||||
|
||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
|
||||
|
||||
fieldsets = (
|
||||
('URL', {
|
||||
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('crawl', 'created_by', 'tags_str'),
|
||||
'fields': ('crawl', 'tags_str'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Config', {
|
||||
|
||||
@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
from core.models import Tag
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
|
||||
@@ -4,9 +4,9 @@ from django.apps import AppConfig
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'core'
|
||||
name = 'archivebox.core'
|
||||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from core.admin_site import register_admin_site
|
||||
from archivebox.core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
@@ -20,7 +20,7 @@ application = get_asgi_application()
|
||||
# from channels.routing import ProtocolTypeRouter, URLRouter
|
||||
# from channels.auth import AuthMiddlewareStack
|
||||
# from channels.security.websocket import AllowedHostsOriginValidator
|
||||
# from core.routing import websocket_urlpatterns
|
||||
# from archivebox.core.routing import websocket_urlpatterns
|
||||
#
|
||||
# application = ProtocolTypeRouter({
|
||||
# "http": get_asgi_application(),
|
||||
|
||||
@@ -4,10 +4,14 @@ from django import forms
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
('1', 'depth = 1 (+ URLs one hop away)'),
|
||||
('2', 'depth = 2 (+ URLs two hops away)'),
|
||||
('3', 'depth = 3 (+ URLs three hops away)'),
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
)
|
||||
|
||||
from archivebox.hooks import get_plugins
|
||||
@@ -18,39 +22,180 @@ def get_plugin_choices():
|
||||
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
|
||||
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
|
||||
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
|
||||
plugins = forms.MultipleChoiceField(
|
||||
label="Plugins (select at least 1, otherwise all will be used by default)",
|
||||
# Basic fields
|
||||
url = forms.RegexField(
|
||||
label="URLs (one per line)",
|
||||
regex=URL_REGEX,
|
||||
min_length='6',
|
||||
strip=True,
|
||||
widget=forms.Textarea,
|
||||
required=True
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags (comma separated tag1,tag2,tag3)",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'list': 'tag-datalist',
|
||||
'autocomplete': 'off',
|
||||
})
|
||||
)
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
choices=DEPTH_CHOICES,
|
||||
initial='0',
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
|
||||
)
|
||||
notes = forms.CharField(
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.Textarea(attrs={
|
||||
'rows': 3,
|
||||
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
|
||||
})
|
||||
)
|
||||
|
||||
# Plugin groups
|
||||
chrome_plugins = forms.MultipleChoiceField(
|
||||
label="Chrome-dependent plugins",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[], # populated in __init__
|
||||
)
|
||||
archiving_plugins = forms.MultipleChoiceField(
|
||||
label="Archiving",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
parsing_plugins = forms.MultipleChoiceField(
|
||||
label="Parsing",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
search_plugins = forms.MultipleChoiceField(
|
||||
label="Search",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
binary_plugins = forms.MultipleChoiceField(
|
||||
label="Binary providers",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
extension_plugins = forms.MultipleChoiceField(
|
||||
label="Browser extensions",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
|
||||
# Advanced options
|
||||
schedule = forms.CharField(
|
||||
label="Repeat schedule",
|
||||
max_length=64,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
)
|
||||
persona = forms.CharField(
|
||||
label="Persona (authentication profile)",
|
||||
max_length=100,
|
||||
initial='Default',
|
||||
required=False,
|
||||
)
|
||||
overwrite = forms.BooleanField(
|
||||
label="Overwrite existing snapshots",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
update = forms.BooleanField(
|
||||
label="Update/retry previously failed URLs",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only (don't archive yet)",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
config = forms.JSONField(
|
||||
label="Custom config overrides",
|
||||
widget=KeyValueWidget(),
|
||||
initial=dict,
|
||||
required=False,
|
||||
widget=forms.SelectMultiple,
|
||||
choices=[], # populated dynamically in __init__
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields['plugins'].choices = get_plugin_choices()
|
||||
# TODO: hook these up to the view and put them
|
||||
# in a collapsible UI section labeled "Advanced"
|
||||
#
|
||||
# exclude_patterns = forms.CharField(
|
||||
# label="Exclude patterns",
|
||||
# min_length='1',
|
||||
# required=False,
|
||||
# initial=URL_DENYLIST,
|
||||
# )
|
||||
# timeout = forms.IntegerField(
|
||||
# initial=TIMEOUT,
|
||||
# )
|
||||
# overwrite = forms.BooleanField(
|
||||
# label="Overwrite any existing Snapshots",
|
||||
# initial=False,
|
||||
# )
|
||||
# index_only = forms.BooleanField(
|
||||
# label="Add URLs to index without Snapshotting",
|
||||
# initial=False,
|
||||
# )
|
||||
|
||||
# Import at runtime to avoid circular imports
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
|
||||
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
}
|
||||
archiving = {
|
||||
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
|
||||
}
|
||||
search = {
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
|
||||
|
||||
# Populate plugin field choices
|
||||
self.fields['chrome_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
self.fields['archiving_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
self.fields['parsing_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
self.fields['search_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
self.fields['binary_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
self.fields['extension_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
# Set update default from config
|
||||
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
def clean(self):
    """Run the parent form validation, then add a combined ``plugins`` entry.

    ``cleaned_data['plugins']`` is the concatenation, in the order listed
    below, of whatever was selected in each plugin group field; a group that
    is missing from the cleaned data contributes nothing.
    """
    data = super().clean()

    plugin_group_fields = (
        'chrome_plugins',
        'archiving_plugins',
        'parsing_plugins',
        'search_plugins',
        'binary_plugins',
        'extension_plugins',
    )

    # Flatten every group's selection into one list for easy access.
    data['plugins'] = [
        plugin
        for group in plugin_group_fields
        for plugin in data.get(group, [])
    ]
    return data
|
||||
|
||||
class TagWidgetMixin:
|
||||
def format_value(self, value):
|
||||
|
||||
@@ -12,7 +12,7 @@ try:
|
||||
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
|
||||
except ImportError:
|
||||
try:
|
||||
from config import CONFIG
|
||||
from archivebox.config import CONFIG
|
||||
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
|
||||
except ImportError:
|
||||
ARCHIVE_DIR = Path('./archive')
|
||||
|
||||
@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
('core', '0031_snapshot_parent_snapshot'),
|
||||
('crawls', '0004_alter_crawl_output_dir'),
|
||||
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
|
||||
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
# Generated migration
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
    """
    Create one catchall Crawl per user for all snapshots without a crawl.
    Assign those snapshots to their user's catchall crawl.

    Forward data step for ``migrations.RunPython``.

    Args:
        apps: app registry passed in by RunPython, used to look up the
            Snapshot, Crawl and user models.
        schema_editor: passed in by RunPython; not used here.
    """
    Snapshot = apps.get_model('core', 'Snapshot')
    Crawl = apps.get_model('crawls', 'Crawl')
    User = apps.get_model(settings.AUTH_USER_MODEL)

    # All snapshots without a crawl (lazy queryset, re-filtered per user below)
    orphans = Snapshot.objects.filter(crawl__isnull=True)

    # Pull only the distinct owner ids into Python rather than every Snapshot
    # row. order_by() clears any ordering on the queryset so that DISTINCT is
    # applied to created_by_id alone; set() removes any leftover duplicates.
    user_ids = set(orphans.order_by().values_list('created_by_id', flat=True).distinct())

    # Create one catchall crawl per user and assign snapshots
    # (no orphans -> empty set -> nothing to do)
    for user_id in user_ids:
        try:
            username = User.objects.get(pk=user_id).username
        except User.DoesNotExist:
            username = 'unknown'

        user_orphans = orphans.filter(created_by_id=user_id)
        # Count first: once the update below runs, these rows no longer match
        # the crawl__isnull filter.
        snapshot_count = user_orphans.count()

        # Create catchall crawl for this user
        crawl = Crawl.objects.create(
            urls=f'# Catchall crawl for {snapshot_count} snapshots without a crawl',
            max_depth=0,
            label=f'[migration] catchall for user {username}',
            created_by_id=user_id,
        )

        # One bulk UPDATE per user instead of one save() per snapshot.
        user_orphans.update(crawl=crawl)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Make ``Snapshot.crawl`` required and drop ``Snapshot.created_by``."""

    dependencies = [
        ('core', '0034_snapshot_current_step'),
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    # The operations run in the order listed: the data step fills in a crawl
    # for every snapshot before the column is made non-nullable, and
    # created_by is only removed after the data step has read it.
    # NOTE(review): this combines RunPython with schema changes in a single
    # migration; Django's docs caution that this can error on PostgreSQL --
    # confirm which database backends need to be supported.
    operations = [
        # Step 1: Assign all snapshots without a crawl to catchall crawls
        # (reverse is a no-op: rolling back leaves the catchall crawls in place)
        migrations.RunPython(
            create_catchall_crawls_and_assign_snapshots,
            reverse_code=migrations.RunPython.noop,
        ),

        # Step 2: Make crawl non-nullable
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),

        # Step 3: Remove created_by field
        migrations.RemoveField(
            model_name='snapshot',
            name='created_by',
        ),
    ]
|
||||
@@ -0,0 +1,19 @@
|
||||
# Generated migration
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Drop the ``created_by`` field from ``ArchiveResult``."""

    # Runs after the core migration that removes Snapshot.created_by.
    dependencies = [
        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
    ]

    operations = [
        # Remove created_by field from ArchiveResult
        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
        migrations.RemoveField(
            model_name='archiveresult',
            name='created_by',
        ),
    ]
|
||||
@@ -9,6 +9,8 @@ import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from statemachine import State, registry
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet, Value, Case, When, IntegerField
|
||||
from django.utils.functional import cached_property
|
||||
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
|
||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||
get_or_create_system_user_pk,
|
||||
)
|
||||
from workers.models import ModelWithStateMachine
|
||||
from workers.tasks import bg_archive_snapshot
|
||||
from crawls.models import Crawl
|
||||
from machine.models import NetworkInterface, Binary
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
from archivebox.workers.tasks import bg_archive_snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.machine.models import NetworkInterface, Binary
|
||||
|
||||
|
||||
|
||||
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
|
||||
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
|
||||
tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
|
||||
|
||||
class Meta:
|
||||
app_label = 'core'
|
||||
db_table = 'core_snapshot_tags'
|
||||
unique_together = [('snapshot', 'tag')]
|
||||
|
||||
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
# Import Methods
|
||||
# =========================================================================
|
||||
|
||||
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
|
||||
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
|
||||
import re
|
||||
from archivebox.config.common import GENERAL_CONFIG
|
||||
|
||||
url = link_dict['url']
|
||||
timestamp = link_dict.get('timestamp')
|
||||
title = link_dict.get('title')
|
||||
tags_str = link_dict.get('tags')
|
||||
|
||||
tag_list = []
|
||||
if tags_str:
|
||||
tag_list = list(dict.fromkeys(
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
|
||||
snapshot = self.filter(url=url).order_by('-created_at').first()
|
||||
if snapshot:
|
||||
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
|
||||
snapshot.title = title
|
||||
snapshot.save(update_fields=['title', 'modified_at'])
|
||||
else:
|
||||
if timestamp:
|
||||
while self.filter(timestamp=timestamp).exists():
|
||||
timestamp = str(float(timestamp) + 1.0)
|
||||
|
||||
snapshot = self.create(
|
||||
url=url,
|
||||
timestamp=timestamp,
|
||||
title=title,
|
||||
created_by_id=created_by_id or get_or_create_system_user_pk(),
|
||||
)
|
||||
|
||||
if tag_list:
|
||||
existing_tags = set(snapshot.tags.values_list('name', flat=True))
|
||||
new_tags = set(tag_list) | existing_tags
|
||||
snapshot.save_tags(new_tags)
|
||||
|
||||
return snapshot
|
||||
|
||||
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
|
||||
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
|
||||
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
|
||||
|
||||
def remove(self, atomic: bool = False) -> tuple:
|
||||
"""Remove snapshots from the database"""
|
||||
from django.db import transaction
|
||||
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
|
||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
|
||||
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
|
||||
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
|
||||
parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
|
||||
|
||||
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
||||
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
|
||||
|
||||
state_machine_name = 'core.statemachines.SnapshotMachine'
|
||||
state_machine_name = 'core.models.SnapshotMachine'
|
||||
state_field_name = 'status'
|
||||
retry_at_field_name = 'retry_at'
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
archiveresult_set: models.Manager['ArchiveResult']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = "Snapshot"
|
||||
verbose_name_plural = "Snapshots"
|
||||
constraints = [
|
||||
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.url[:64]}'
|
||||
|
||||
@property
def created_by(self):
    """User who created this snapshot, read from its crawl (``self.crawl.created_by``)."""
    parent_crawl = self.crawl
    return parent_crawl.created_by
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if not self.bookmarked_at:
|
||||
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
self.fs_version = target
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
if self.crawl and self.url not in self.crawl.urls:
|
||||
if self.url not in self.crawl.urls:
|
||||
self.crawl.urls += f'\n{self.url}'
|
||||
self.crawl.save()
|
||||
|
||||
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
url=self.url,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
|
||||
'crawl_id': str(self.crawl_id),
|
||||
'depth': self.depth,
|
||||
'status': self.status,
|
||||
},
|
||||
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return self.fs_version != self._fs_current_version()
|
||||
|
||||
def _fs_next_version(self, version: str) -> str:
|
||||
"""Get next version in migration chain"""
|
||||
chain = ['0.7.0', '0.8.0', '0.9.0']
|
||||
try:
|
||||
idx = chain.index(version)
|
||||
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
|
||||
except ValueError:
|
||||
# Unknown version - skip to current
|
||||
return self._fs_current_version()
|
||||
|
||||
def _fs_migrate_from_0_7_0_to_0_8_0(self):
|
||||
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
|
||||
# 0.7 and 0.8 both used archive/<timestamp>
|
||||
# Nothing to do!
|
||||
pass
|
||||
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
|
||||
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
|
||||
if version in ('0.7.0', '0.8.0'):
|
||||
return '0.9.0'
|
||||
return self._fs_current_version()
|
||||
|
||||
def _fs_migrate_from_0_8_0_to_0_9_0(self):
|
||||
"""
|
||||
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
|
||||
elif version in ('0.9.0', '1.0.0'):
|
||||
username = self.created_by.username if self.created_by else 'unknown'
|
||||
username = self.created_by.username
|
||||
|
||||
# Use created_at for date grouping (fallback to timestamp)
|
||||
if self.created_at:
|
||||
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
pwd=result_data.get('pwd', str(self.output_dir)),
|
||||
start_ts=start_ts,
|
||||
end_ts=end_ts,
|
||||
created_by=self.created_by,
|
||||
)
|
||||
except:
|
||||
pass
|
||||
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
result = archive_results.get(plugin)
|
||||
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
|
||||
icon = get_plugin_icon(plugin)
|
||||
|
||||
# Skip plugins with empty icons that have no output
|
||||
# (e.g., staticfile only shows when there's actual output)
|
||||
if not icon.strip() and not existing:
|
||||
continue
|
||||
|
||||
output += format_html(
|
||||
output_template,
|
||||
path,
|
||||
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
def run(self) -> list['ArchiveResult']:
|
||||
"""
|
||||
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
|
||||
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
|
||||
|
||||
Called by the state machine when entering the 'started' state.
|
||||
Called by: SnapshotMachine.enter_started()
|
||||
|
||||
Hook Lifecycle:
|
||||
1. discover_hooks('Snapshot') → finds all plugin hooks
|
||||
2. For each hook:
|
||||
- Create ArchiveResult with status=QUEUED
|
||||
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
|
||||
3. ArchiveResults execute independently via ArchiveResultMachine
|
||||
4. Hook execution happens in ArchiveResult.run(), NOT here
|
||||
|
||||
Returns:
|
||||
list[ArchiveResult]: Newly created pending results
|
||||
"""
|
||||
return self.create_pending_archiveresults()
|
||||
|
||||
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
Called by the state machine when entering the 'sealed' state.
|
||||
Kills any background hooks and finalizes their ArchiveResults.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import kill_process
|
||||
|
||||
# Kill any background ArchiveResult hooks
|
||||
if not self.OUTPUT_DIR.exists():
|
||||
return
|
||||
|
||||
for plugin_dir in self.OUTPUT_DIR.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
if pid_file.exists():
|
||||
kill_process(pid_file, validate=True) # Use validation
|
||||
# Find all .pid files in this snapshot's output directory
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
kill_process(pid_file, validate=True)
|
||||
|
||||
# Update the ArchiveResult from filesystem
|
||||
plugin_name = plugin_dir.name
|
||||
results = self.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
pwd__contains=plugin_name
|
||||
)
|
||||
for ar in results:
|
||||
ar.update_from_output()
|
||||
# Update all STARTED ArchiveResults from filesystem
|
||||
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
|
||||
for ar in results:
|
||||
ar.update_from_output()
|
||||
|
||||
def has_running_background_hooks(self) -> bool:
|
||||
"""
|
||||
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
"""
|
||||
Create/update Snapshot from JSONL record.
|
||||
Create/update Snapshot from JSONL record or dict.
|
||||
|
||||
Unified method that handles:
|
||||
- ID-based patching: {"id": "...", "title": "new title"}
|
||||
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
||||
- Auto-creates Crawl if not provided
|
||||
- Optionally queues for extraction
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'url' field and optional metadata
|
||||
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
|
||||
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
|
||||
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
|
||||
|
||||
Returns:
|
||||
Snapshot instance or None
|
||||
|
||||
Note:
|
||||
Filtering (depth, URL allowlist/denylist) should be done by caller
|
||||
BEFORE calling this method. This method just creates the snapshot.
|
||||
"""
|
||||
from archivebox.misc.jsonl import get_or_create_snapshot
|
||||
import re
|
||||
from django.utils import timezone
|
||||
from archivebox.misc.util import parse_date
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.config.common import GENERAL_CONFIG
|
||||
|
||||
overrides = overrides or {}
|
||||
|
||||
# If 'id' is provided, lookup and patch that specific snapshot
|
||||
snapshot_id = record.get('id')
|
||||
if snapshot_id:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
# Generically update all fields present in record
|
||||
update_fields = []
|
||||
for field_name, value in record.items():
|
||||
# Skip internal fields
|
||||
if field_name in ('id', 'type'):
|
||||
continue
|
||||
|
||||
# Skip if field doesn't exist on model
|
||||
if not hasattr(snapshot, field_name):
|
||||
continue
|
||||
|
||||
# Special parsing for date fields
|
||||
if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
|
||||
if value and isinstance(value, str):
|
||||
value = parse_date(value)
|
||||
|
||||
# Update field if value is provided and different
|
||||
if value is not None and getattr(snapshot, field_name) != value:
|
||||
setattr(snapshot, field_name, value)
|
||||
update_fields.append(field_name)
|
||||
|
||||
if update_fields:
|
||||
snapshot.save(update_fields=update_fields + ['modified_at'])
|
||||
|
||||
return snapshot
|
||||
except Snapshot.DoesNotExist:
|
||||
# ID not found, fall through to create-by-URL logic
|
||||
pass
|
||||
|
||||
url = record.get('url')
|
||||
if not url:
|
||||
return None
|
||||
|
||||
# Apply crawl context metadata
|
||||
# Determine or create crawl (every snapshot must have a crawl)
|
||||
crawl = overrides.get('crawl')
|
||||
snapshot = overrides.get('snapshot') # Parent snapshot
|
||||
parent_snapshot = overrides.get('snapshot') # Parent snapshot
|
||||
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
|
||||
|
||||
if crawl:
|
||||
record.setdefault('crawl_id', str(crawl.id))
|
||||
record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
|
||||
if snapshot:
|
||||
record.setdefault('parent_snapshot_id', str(snapshot.id))
|
||||
# If no crawl provided, inherit from parent or auto-create one
|
||||
if not crawl:
|
||||
if parent_snapshot:
|
||||
# Inherit crawl from parent snapshot
|
||||
crawl = parent_snapshot.crawl
|
||||
else:
|
||||
# Auto-create a single-URL crawl
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
try:
|
||||
created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
|
||||
new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(url)
|
||||
|
||||
# Queue for extraction
|
||||
new_snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
new_snapshot.retry_at = timezone.now()
|
||||
new_snapshot.save()
|
||||
crawl = Crawl.objects.create(
|
||||
urls=url,
|
||||
max_depth=0,
|
||||
label=f'auto-created for {url[:50]}',
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
|
||||
return new_snapshot
|
||||
except ValueError:
|
||||
return None
|
||||
# Parse tags
|
||||
tags_str = record.get('tags', '')
|
||||
tag_list = []
|
||||
if tags_str:
|
||||
tag_list = list(dict.fromkeys(
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
|
||||
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
|
||||
|
||||
title = record.get('title')
|
||||
timestamp = record.get('timestamp')
|
||||
|
||||
if snapshot:
|
||||
# Update existing snapshot
|
||||
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
|
||||
snapshot.title = title
|
||||
snapshot.save(update_fields=['title', 'modified_at'])
|
||||
else:
|
||||
# Create new snapshot
|
||||
if timestamp:
|
||||
while Snapshot.objects.filter(timestamp=timestamp).exists():
|
||||
timestamp = str(float(timestamp) + 1.0)
|
||||
|
||||
snapshot = Snapshot.objects.create(
|
||||
url=url,
|
||||
timestamp=timestamp,
|
||||
title=title,
|
||||
crawl=crawl,
|
||||
)
|
||||
|
||||
# Update tags
|
||||
if tag_list:
|
||||
existing_tags = set(snapshot.tags.values_list('name', flat=True))
|
||||
new_tags = set(tag_list) | existing_tags
|
||||
snapshot.save_tags(new_tags)
|
||||
|
||||
# Queue for extraction and update additional fields
|
||||
update_fields = []
|
||||
|
||||
if queue_for_extraction:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
update_fields.extend(['status', 'retry_at'])
|
||||
|
||||
# Update additional fields if provided
|
||||
for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
|
||||
value = record.get(field_name)
|
||||
if value is not None and getattr(snapshot, field_name) != value:
|
||||
setattr(snapshot, field_name, value)
|
||||
update_fields.append(field_name)
|
||||
|
||||
if update_fields:
|
||||
snapshot.save(update_fields=update_fields + ['modified_at'])
|
||||
|
||||
return snapshot
|
||||
|
||||
def create_pending_archiveresults(self) -> list['ArchiveResult']:
|
||||
"""
|
||||
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'plugin': plugin,
|
||||
'status': ArchiveResult.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': self.created_by_id,
|
||||
},
|
||||
)
|
||||
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
self.save(update_fields=['current_step', 'modified_at'])
|
||||
return True
|
||||
|
||||
def is_finished_processing(self) -> bool:
|
||||
"""
|
||||
Check if this snapshot has finished processing.
|
||||
|
||||
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
|
||||
|
||||
Returns:
|
||||
True once archiveresults exist and none are still pending; False if none exist yet or any are still pending.
|
||||
"""
|
||||
# if no archiveresults exist yet, it's not finished
|
||||
if not self.archiveresult_set.exists():
|
||||
return False
|
||||
|
||||
# Try to advance step if ready (handles step-based hook execution)
|
||||
# This will increment current_step when all foreground hooks in current step are done
|
||||
while self.advance_step_if_ready():
|
||||
pass # Keep advancing until we can't anymore
|
||||
|
||||
# if archiveresults exist but are still pending, it's not finished
|
||||
if self.pending_archiveresults().exists():
|
||||
return False
|
||||
|
||||
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
|
||||
# Background hooks in STARTED state are excluded by pending_archiveresults()
|
||||
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
|
||||
# we can transition to sealed and cleanup() will kill the background hooks
|
||||
|
||||
# otherwise archiveresults exist and none are pending (only finished or still-running background hooks remain), so it's finished
|
||||
return True
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Snapshot State Machine
|
||||
# =============================================================================
|
||||
|
||||
class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing Snapshot lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Waiting for snapshot to be ready │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. snapshot.run() │
|
||||
│ • discover_hooks('Snapshot') → finds all plugin hooks │
|
||||
│ • create_pending_archiveresults() → creates ONE │
|
||||
│ ArchiveResult per hook (NO execution yet) │
|
||||
│ 2. ArchiveResults process independently with their own │
|
||||
│ state machines (see ArchiveResultMachine) │
|
||||
│ 3. Advance through steps 0-9 as foreground hooks complete │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when is_finished()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SEALED State → enter_sealed() │
|
||||
│ • cleanup() → kills any background hooks still running │
|
||||
│ • Set retry_at=None (no more processing) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
||||
"""
|
||||
|
||||
model_attr_name = 'snapshot'
|
||||
|
||||
# States
|
||||
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=Snapshot.StatusChoices.STARTED)
|
||||
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if snapshot processing is complete - delegates to model method."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
|
||||
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
# Note: unique constraint is added by migration 0027 - don't set unique=True here
|
||||
# or SQLite table recreation in earlier migrations will fail
|
||||
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Binary FK (optional - set when hook reports cmd)
|
||||
binary = models.ForeignKey(
|
||||
'machine.Binary',
|
||||
Binary,
|
||||
on_delete=models.SET_NULL,
|
||||
null=True, blank=True,
|
||||
related_name='archiveresults',
|
||||
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||
iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
|
||||
|
||||
state_machine_name = 'core.statemachines.ArchiveResultMachine'
|
||||
state_machine_name = 'core.models.ArchiveResultMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results Log'
|
||||
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
|
||||
|
||||
@property
|
||||
def created_by(self):
|
||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||
return self.snapshot.crawl.created_by
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
|
||||
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def save_search_index(self):
|
||||
pass
|
||||
|
||||
def cascade_health_update(self, success: bool):
|
||||
"""Update health stats for self, parent Snapshot, and grandparent Crawl."""
|
||||
self.increment_health_stats(success)
|
||||
self.snapshot.increment_health_stats(success)
|
||||
self.snapshot.crawl.increment_health_stats(success)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute this ArchiveResult's hook and update status.
|
||||
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
# Get merged config with proper context
|
||||
config = get_config(
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
# Determine which hook(s) to run
|
||||
hooks = []
|
||||
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_dir,
|
||||
config_objects=config_objects,
|
||||
config=config,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
|
||||
crawl_id=str(self.snapshot.crawl.id),
|
||||
depth=self.snapshot.depth,
|
||||
)
|
||||
|
||||
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Filter Snapshot records for depth/URL constraints
|
||||
if record_type == 'Snapshot':
|
||||
if not self.snapshot.crawl:
|
||||
continue
|
||||
|
||||
url = record.get('url')
|
||||
if not url:
|
||||
continue
|
||||
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
overrides = {
|
||||
'snapshot': self.snapshot,
|
||||
'crawl': self.snapshot.crawl,
|
||||
'created_by_id': self.snapshot.created_by_id,
|
||||
'created_by_id': self.created_by.pk,
|
||||
}
|
||||
process_hook_records(filtered_records, overrides=overrides)
|
||||
|
||||
# Update snapshot title if this is the title plugin
|
||||
plugin_name = get_plugin_name(self.plugin)
|
||||
if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
|
||||
self._update_snapshot_title(plugin_dir)
|
||||
|
||||
# Trigger search indexing if succeeded
|
||||
if self.status == self.StatusChoices.SUCCEEDED:
|
||||
self.trigger_search_indexing()
|
||||
|
||||
# Cleanup PID files and empty logs
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
pid_file.unlink(missing_ok=True)
|
||||
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if not cmd:
|
||||
return
|
||||
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
||||
machine = Machine.current()
|
||||
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if binary:
|
||||
self.binary = binary
|
||||
|
||||
def _update_snapshot_title(self, plugin_dir: Path):
|
||||
"""
|
||||
Update snapshot title from title plugin output.
|
||||
|
||||
The title plugin writes title.txt with the extracted page title.
|
||||
This updates the Snapshot.title field if the file exists and has content.
|
||||
"""
|
||||
title_file = plugin_dir / 'title.txt'
|
||||
if title_file.exists():
|
||||
try:
|
||||
title = title_file.read_text(encoding='utf-8').strip()
|
||||
if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
|
||||
self.snapshot.title = title[:512] # Max length from model
|
||||
self.snapshot.save(update_fields=['title', 'modified_at'])
|
||||
except Exception:
|
||||
pass # Failed to read title, that's okay
|
||||
|
||||
def _url_passes_filters(self, url: str) -> bool:
|
||||
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
|
||||
|
||||
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Get merged config with proper hierarchy
|
||||
config = get_config(
|
||||
user=self.snapshot.created_by if self.snapshot else None,
|
||||
crawl=self.snapshot.crawl if self.snapshot else None,
|
||||
user=self.created_by,
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
return False # No allowlist patterns matched
|
||||
|
||||
return True # No filters or passed filters
|
||||
|
||||
def trigger_search_indexing(self):
|
||||
"""Run any ArchiveResult__index hooks to update search indexes."""
|
||||
from archivebox.hooks import discover_hooks, run_hook
|
||||
|
||||
# Pass config objects in priority order (later overrides earlier)
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
|
||||
for hook in discover_hooks('ArchiveResult__index'):
|
||||
run_hook(
|
||||
hook,
|
||||
output_dir=self.output_dir,
|
||||
config_objects=config_objects,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
plugin=self.plugin,
|
||||
)
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if not plugin_dir:
|
||||
return False
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
return pid_file.exists()
|
||||
return pid_file.exists()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ArchiveResult State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing ArchiveResult (single plugin execution) lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Waiting for its turn to run │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. archiveresult.run() │
|
||||
│ • Find specific hook by hook_name │
|
||||
│ • run_hook(script, output_dir, ...) → subprocess │
|
||||
│ │
|
||||
│ 2a. FOREGROUND hook (returns HookResult): │
|
||||
│ • update_from_output() immediately │
|
||||
│ - Read stdout.log │
|
||||
│ - Parse JSONL records │
|
||||
│ - Extract 'ArchiveResult' record → update status │
|
||||
│ - Walk output_dir → populate output_files │
|
||||
│ - Call process_hook_records() for side effects │
|
||||
│ │
|
||||
│ 2b. BACKGROUND hook (returns None): │
|
||||
│ • Status stays STARTED │
|
||||
│ • Continues running in background │
|
||||
│ • Killed by Snapshot.cleanup() when sealed │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() checks status
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
|
||||
│ • Set by hook's JSONL output during update_from_output() │
|
||||
│ • On SUCCEEDED/FAILED: health stats incremented (num_uses_succeeded/failed) │
|
||||
│ • Parent Snapshot and Crawl health stats also updated │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
||||
"""
|
||||
|
||||
model_attr_name = 'archiveresult'
|
||||
|
||||
# States
|
||||
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=ArchiveResult.StatusChoices.STARTED)
|
||||
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
|
||||
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
|
||||
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(skipped, cond='is_skipped') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to.itself(unless='can_start') |
|
||||
backoff.to(started, cond='can_start') |
|
||||
backoff.to(succeeded, cond='is_succeeded') |
|
||||
backoff.to(failed, cond='is_failed') |
|
||||
backoff.to(skipped, cond='is_skipped')
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.archiveresult.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if extractor plugin failed (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
|
||||
|
||||
def is_skipped(self) -> bool:
|
||||
"""Check if extractor plugin was skipped (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (plugin didn't complete) and output_str is empty
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||
not self.archiveresult.output_str
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if extraction has completed (success, failure, or skipped)."""
|
||||
return self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
start_ts=None,
|
||||
) # bump the snapshot's retry_at so they pick up any new changes
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
from archivebox.machine.models import NetworkInterface
|
||||
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
iface=NetworkInterface.current(),
|
||||
)
|
||||
|
||||
# Run the plugin - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
|
||||
@backoff.enter
def enter_backoff(self):
    """On entering ``backoff``: set status BACKOFF, retry in 60 seconds, clear end_ts."""
    retry_after = timezone.now() + timedelta(seconds=60)
    self.archiveresult.update_and_requeue(
        retry_at=retry_after,
        status=ArchiveResult.StatusChoices.BACKOFF,
        end_ts=None,
    )
|
||||
|
||||
@succeeded.enter
def enter_succeeded(self):
    """On entering ``succeeded``: finalize the result and record a success."""
    finished_at = timezone.now()
    self.archiveresult.update_and_requeue(
        retry_at=None,  # no further retries scheduled
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        end_ts=finished_at,
    )

    # Propagate the successful outcome into the health stats
    # (ArchiveResult, Snapshot, and Crawl cascade).
    self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
@failed.enter
def enter_failed(self):
    """On entering ``failed``: finalize the result and record a failure."""
    finished_at = timezone.now()
    self.archiveresult.update_and_requeue(
        retry_at=None,  # no further retries scheduled
        status=ArchiveResult.StatusChoices.FAILED,
        end_ts=finished_at,
    )

    # Propagate the failed outcome into the health stats
    # (ArchiveResult, Snapshot, and Crawl cascade).
    self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
@skipped.enter
def enter_skipped(self):
    """On entering ``skipped``: set status SKIPPED, clear retry_at, stamp end_ts."""
    finished_at = timezone.now()
    self.archiveresult.update_and_requeue(
        retry_at=None,
        status=ArchiveResult.StatusChoices.SKIPPED,
        end_ts=finished_at,
    )
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
    """After any transition, requeue the parent snapshot.

    ``event``, ``source`` and ``target`` are accepted but not used here.
    """
    # Bump the snapshot's retry time so it picks up all the new changes.
    parent_snapshot = self.archiveresult.snapshot
    parent_snapshot.update_and_requeue()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
# =============================================================================
|
||||
|
||||
# Register both state machines with the python-statemachine registry by hand.
# (They would normally be auto-discovered from statemachines.py, but they are
# defined in this module for clarity.)
for _machine_cls in (SnapshotMachine, ArchiveResultMachine):
    registry.register(_machine_cls)
|
||||
2638
archivebox/core/models.py.bak
Executable file
2638
archivebox/core/models.py.bak
Executable file
File diff suppressed because it is too large
Load Diff
@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
|
||||
### Django Core Settings
|
||||
################################################################################
|
||||
|
||||
WSGI_APPLICATION = "core.wsgi.application"
|
||||
ASGI_APPLICATION = "core.asgi.application"
|
||||
ROOT_URLCONF = "core.urls"
|
||||
WSGI_APPLICATION = "archivebox.core.wsgi.application"
|
||||
ASGI_APPLICATION = "archivebox.core.asgi.application"
|
||||
ROOT_URLCONF = "archivebox.core.urls"
|
||||
|
||||
LOGIN_URL = "/accounts/login/"
|
||||
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
|
||||
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
|
||||
# 3rd-party apps from PyPI
|
||||
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
# Our ArchiveBox-provided apps
|
||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
|
||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"crawls", # handles Crawl and CrawlSchedule models and management
|
||||
"personas", # handles Persona and session management
|
||||
"core", # core django model with Snapshot, ArchiveResult, etc.
|
||||
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
# Our ArchiveBox-provided apps (use fully qualified names)
|
||||
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
|
||||
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
|
||||
"archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
|
||||
"archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"archivebox.personas", # handles Persona and session management
|
||||
"archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
|
||||
"archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core)
|
||||
"archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
|
||||
# Use hooks.py discover_hooks() for plugin functionality
|
||||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
|
||||
|
||||
|
||||
MIDDLEWARE = [
|
||||
"core.middleware.TimezoneMiddleware",
|
||||
"archivebox.core.middleware.TimezoneMiddleware",
|
||||
"django.middleware.security.SecurityMiddleware",
|
||||
"django.contrib.sessions.middleware.SessionMiddleware",
|
||||
"django.middleware.common.CommonMiddleware",
|
||||
"django.middleware.csrf.CsrfViewMiddleware",
|
||||
"django.contrib.auth.middleware.AuthenticationMiddleware",
|
||||
"core.middleware.ReverseProxyAuthMiddleware",
|
||||
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
|
||||
"django.contrib.messages.middleware.MessageMiddleware",
|
||||
"core.middleware.CacheControlMiddleware",
|
||||
"archivebox.core.middleware.CacheControlMiddleware",
|
||||
# Additional middlewares from plugins (if any)
|
||||
]
|
||||
|
||||
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
|
||||
################################################################################
|
||||
|
||||
# Add default webhook configuration to the User model
|
||||
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
|
||||
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
|
||||
SIGNAL_WEBHOOKS = {
|
||||
"HOOKS": {
|
||||
# ... is a special sigil value that means "use the default autogenerated hooks"
|
||||
"django.contrib.auth.models.User": ...,
|
||||
"core.models.Snapshot": ...,
|
||||
"core.models.ArchiveResult": ...,
|
||||
"core.models.Tag": ...,
|
||||
"api.models.APIToken": ...,
|
||||
"archivebox.core.models.Snapshot": ...,
|
||||
"archivebox.core.models.ArchiveResult": ...,
|
||||
"archivebox.core.models.Tag": ...,
|
||||
"archivebox.api.models.APIToken": ...,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
|
||||
"URLS": [
|
||||
{
|
||||
"route": "config/",
|
||||
"view": "core.views.live_config_list_view",
|
||||
"view": "archivebox.core.views.live_config_list_view",
|
||||
"name": "Configuration",
|
||||
"items": {
|
||||
"route": "<str:key>/",
|
||||
"view": "core.views.live_config_value_view",
|
||||
"view": "archivebox.core.views.live_config_value_view",
|
||||
"name": "config_val",
|
||||
},
|
||||
},
|
||||
|
||||
@@ -1,319 +0,0 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import time
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import ClassVar
|
||||
|
||||
from django.db.models import F
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
# from workers.actor import ActorType
|
||||
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Crawl
|
||||
|
||||
|
||||
class SnapshotMachine(StateMachine, strict_states=True):
    """
    Drives a Snapshot through queued -> started -> sealed.

    A single ``tick`` event advances the machine; which transition fires is
    decided by the ``can_start`` and ``is_finished`` condition methods.

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    model: Snapshot

    # States (values mirror Snapshot.StatusChoices)
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event: declaration order is kept exactly as before.
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(sealed, cond='is_finished')
    )

    def __init__(self, snapshot, *args, **kwargs):
        """Remember the snapshot and hand it to StateMachine as the model."""
        self.snapshot = snapshot
        super().__init__(snapshot, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Snapshot[{self.snapshot.id}]'

    def __str__(self) -> str:
        return repr(self)

    def can_start(self) -> bool:
        """A snapshot can start as soon as it has a truthy ``url``."""
        return bool(self.snapshot.url)

    def is_finished(self) -> bool:
        """Return True once archiveresults exist and none are pending.

        Side effect: repeatedly calls ``snapshot.advance_step_if_ready()``
        until it returns a falsy value.
        """
        snapshot = self.snapshot

        # Nothing has been created yet, so there is nothing to be finished.
        if not snapshot.archiveresult_set.exists():
            return False

        # Keep advancing the step counter for as long as the snapshot says
        # another step is ready (step-based hook execution).
        while snapshot.advance_step_if_ready():
            continue

        # Background hooks are not waited on here; per the original notes,
        # results in STARTED state are excluded by pending_archiveresults()
        # and get cleaned up when the sealed state is entered.
        return not snapshot.pending_archiveresults().exists()

    @queued.enter
    def enter_queued(self):
        """On entering ``queued``: set status QUEUED and retry_at to now."""
        queued_at = timezone.now()
        self.snapshot.update_for_workers(
            retry_at=queued_at,
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """On entering ``started``: lock, run the snapshot, then unlock as STARTED."""
        snapshot = self.snapshot

        # Lock the snapshot while the pending archiveresults are created;
        # if this fails, retry_at leaves a 30s wait before the next attempt.
        lock_until = timezone.now() + timedelta(seconds=30)
        snapshot.update_for_workers(retry_at=lock_until)

        # Creates pending archiveresults for all enabled plugins.
        snapshot.run()

        # Unlock: mark as started and ask to be checked again in 5s.
        recheck_at = timezone.now() + timedelta(seconds=5)
        snapshot.update_for_workers(
            retry_at=recheck_at,
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self):
        """On entering ``sealed``: clean up background hooks, then mark SEALED."""
        snapshot = self.snapshot
        snapshot.cleanup()
        snapshot.update_for_workers(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
|
||||
|
||||
|
||||
# class SnapshotWorker(ActorType[Snapshot]):
|
||||
# """
|
||||
# The primary actor for progressing Snapshot objects
|
||||
# through their lifecycle using the SnapshotMachine.
|
||||
# """
|
||||
# Model = Snapshot
|
||||
# StateMachineClass = SnapshotMachine
|
||||
|
||||
# ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
|
||||
|
||||
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||
# MAX_TICK_TIME: ClassVar[int] = 10
|
||||
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ArchiveResultMachine(StateMachine, strict_states=True):
    """
    State machine for managing ArchiveResult lifecycle.

    States: queued (initial), started, backoff, and the final states
    succeeded / failed / skipped. A single ``tick`` event advances the
    machine; the condition methods below decide which transition fires.

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    model: ArchiveResult

    # States (values mirror ArchiveResult.StatusChoices)
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions (declaration order preserved).
    # NOTE(review): `started.to.itself(unless='is_finished')` is declared before
    # `started.to(backoff, cond='is_backoff')`, and is_backoff() requires the
    # status to still be STARTED (i.e. not finished), so the started -> backoff
    # transition looks unreachable via tick -- confirm intended ordering.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(skipped, cond='is_skipped') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start') |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed') |
        backoff.to(skipped, cond='is_skipped')
    )

    def __init__(self, archiveresult, *args, **kwargs):
        """Remember the archiveresult and hand it to StateMachine as the model."""
        self.archiveresult = archiveresult
        super().__init__(archiveresult, *args, **kwargs)

    def __repr__(self) -> str:
        return f'ArchiveResult[{self.archiveresult.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """An archiveresult can start once its snapshot has a truthy ``url``."""
        return bool(self.archiveresult.snapshot.url)

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
            not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """Check if extraction has completed (success, failure, or skipped)."""
        return self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        )

    def _increment_health_stat(self, field_name: str) -> None:
        """Atomically add 1 to ``field_name`` on the ArchiveResult, its Snapshot, and its Crawl (if any).

        ``field_name`` is the counter column to bump, i.e. ``'num_uses_succeeded'``
        or ``'num_uses_failed'``. Each update is issued as a queryset ``update()``
        with an ``F()`` expression so the increment happens in the database.
        """
        def increment() -> dict:
            # Build a fresh expression per query rather than sharing one object.
            return {field_name: F(field_name) + 1}

        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(**increment())
        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(**increment())

        # Also update Crawl health stats if snapshot has a crawl
        snapshot = self.archiveresult.snapshot
        if snapshot.crawl_id:
            Crawl.objects.filter(pk=snapshot.crawl_id).update(**increment())

    @queued.enter
    def enter_queued(self):
        """On entering ``queued``: set status QUEUED, retry_at to now, clear start_ts."""
        self.archiveresult.update_for_workers(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )

    @started.enter
    def enter_started(self):
        """On entering ``started``: lock the result, run the plugin, then save it."""
        from machine.models import NetworkInterface

        # Lock the object and mark start time
        self.archiveresult.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
            iface=NetworkInterface.current(),
        )

        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()

        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        """On entering ``backoff``: set status BACKOFF, retry in 60 seconds, clear end_ts."""
        self.archiveresult.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
            # retries=F('retries') + 1,  # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
        )
        self.archiveresult.save()

    @succeeded.enter
    def enter_succeeded(self):
        """On entering ``succeeded``: finalize the result and bump success counters."""
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
            # **self.archiveresult.get_output_dict(),  # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
        )
        self.archiveresult.save()

        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        self._increment_health_stat('num_uses_succeeded')

    @failed.enter
    def enter_failed(self):
        """On entering ``failed``: finalize the result and bump failure counters."""
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )

        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        self._increment_health_stat('num_uses_failed')

    @skipped.enter
    def enter_skipped(self):
        """On entering ``skipped``: set status SKIPPED, clear retry_at, stamp end_ts."""
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

    def after_transition(self, event: str, source: State, target: State):
        """After any transition, bump the parent snapshot so it picks up the new changes."""
        self.archiveresult.snapshot.update_for_workers()  # bump snapshot retry time so it picks up all the new changes
|
||||
|
||||
|
||||
# class ArchiveResultWorker(ActorType[ArchiveResult]):
|
||||
# """
|
||||
# The primary actor for progressing ArchiveResult objects
|
||||
# through their lifecycle using the ArchiveResultMachine.
|
||||
# """
|
||||
# Model = ArchiveResult
|
||||
# StateMachineClass = ArchiveResultMachine
|
||||
|
||||
# ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
|
||||
|
||||
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
|
||||
# MAX_TICK_TIME: ClassVar[int] = 60
|
||||
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||
20
archivebox/core/templatetags/config_tags.py
Normal file
20
archivebox/core/templatetags/config_tags.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Template tags for accessing config values in templates."""
|
||||
|
||||
from django import template
|
||||
|
||||
from archivebox.config.configset import get_config as _get_config
|
||||
|
||||
register = template.Library()
|
||||
|
||||
|
||||
@register.simple_tag
def get_config(key: str) -> object:
    """
    Get a config value by key.

    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}

    Returns whatever ``_get_config(key)`` returns, or ``None`` when that call
    raises ``KeyError`` or ``AttributeError``.
    """
    # NOTE(review): assumes archivebox.config.configset.get_config accepts a
    # single key and returns that key's value -- confirm against its signature.
    try:
        return _get_config(key)
    except (KeyError, AttributeError):
        # Best-effort lookup: the template receives None instead of an error.
        return None
|
||||
@@ -1,3 +1,319 @@
|
||||
#from django.test import TestCase
|
||||
"""Tests for the core views, especially AddView."""
|
||||
|
||||
# Create your tests here.
|
||||
import os
|
||||
import django
|
||||
|
||||
# Set up Django before importing any Django-dependent modules.
# Use the fully qualified settings path, matching the value the archivebox
# package __init__ assigns ('archivebox.core.settings').
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
django.setup()
|
||||
|
||||
from django.test import TestCase, Client
|
||||
from django.contrib.auth.models import User
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
|
||||
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form).

    Every test runs as a logged-in user created in ``setUp`` and talks to the
    URL named ``'add'`` through the Django test client.
    """

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')

        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')

        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')

        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })

        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)

        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()

        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)

        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })

        self.assertEqual(response.status_code, 302)

        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)

        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()

        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })

        self.assertEqual(response.status_code, 302)

        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')

        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })

            self.assertEqual(response.status_code, 302)

        self.assertEqual(Crawl.objects.count(), 5)

        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        config = crawl.config

        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: Django test client can't easily POST the KeyValueWidget format,
        # so this test would need to use the form directly or mock the cleaned_data.
        # Report it as skipped instead of letting an empty body count as a pass.
        self.skipTest('TODO: exercise custom config overrides (KeyValueWidget POST format)')

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })

        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # Pass the bound form itself: the (response, form_name, ...) signature
        # was deprecated in Django 4.1 and removed in Django 5.0.
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })

        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)

        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)

        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        self.assertEqual(response.status_code, 302)

        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR

        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'

        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
|
||||
|
||||
@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
|
||||
|
||||
from archivebox.misc.serve_static import serve_static
|
||||
|
||||
from core.admin_site import archivebox_admin
|
||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
from archivebox.core.admin_site import archivebox_admin
|
||||
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
|
||||
from workers.views import JobsDashboardView
|
||||
from archivebox.workers.views import JobsDashboardView
|
||||
|
||||
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
|
||||
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
|
||||
|
||||
@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.search import query_search_index
|
||||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_extractors, get_extractor_name
|
||||
|
||||
|
||||
@@ -150,7 +150,6 @@ class SnapshotView(View):
|
||||
'status_color': 'success' if snapshot.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
|
||||
'warc_path': warc_path,
|
||||
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||
'best_result': best_result,
|
||||
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
return {
|
||||
**super().get_context_data(**kwargs),
|
||||
'title': "Add URLs",
|
||||
'title': "Create Crawl",
|
||||
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
|
||||
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
|
||||
'VERSION': VERSION,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
'stdout': '',
|
||||
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
|
||||
}
|
||||
|
||||
def form_valid(self, form):
|
||||
urls = form.cleaned_data["url"]
|
||||
print(f'[+] Adding URL: {urls}')
|
||||
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||
plugins = ','.join(form.cleaned_data["archive_methods"])
|
||||
input_kwargs = {
|
||||
"urls": urls,
|
||||
"tag": tag,
|
||||
"depth": depth,
|
||||
"parser": parser,
|
||||
"update_all": False,
|
||||
"out_dir": DATA_DIR,
|
||||
"created_by_id": self.request.user.pk,
|
||||
}
|
||||
if plugins:
|
||||
input_kwargs.update({"plugins": plugins})
|
||||
|
||||
# Extract all form fields
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = int(form.cleaned_data["depth"])
|
||||
plugins = ','.join(form.cleaned_data.get("plugins", []))
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
persona = form.cleaned_data.get("persona", "Default")
|
||||
overwrite = form.cleaned_data.get("overwrite", False)
|
||||
update = form.cleaned_data.get("update", False)
|
||||
index_only = form.cleaned_data.get("index_only", False)
|
||||
notes = form.cleaned_data.get("notes", "")
|
||||
custom_config = form.cleaned_data.get("config", {})
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
|
||||
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
# 2. create a new Crawl with the URLs from the file
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
urls_content = sources_file.read_text()
|
||||
# Build complete config
|
||||
config = {
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
|
||||
# Merge custom config overrides
|
||||
config.update(custom_config)
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
notes=notes,
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||
created_by_id=self.request.user.pk,
|
||||
config={
|
||||
# 'ONLY_NEW': not update,
|
||||
# 'INDEX_ONLY': index_only,
|
||||
# 'OVERWRITE': False,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
config=config
|
||||
)
|
||||
|
||||
|
||||
# 3. create a CrawlSchedule if schedule is provided
|
||||
if schedule:
|
||||
from crawls.models import CrawlSchedule
|
||||
crawl_schedule = CrawlSchedule.objects.create(
|
||||
template=crawl,
|
||||
schedule=schedule,
|
||||
is_enabled=True,
|
||||
label=crawl.label,
|
||||
notes=f"Auto-created from add page. {notes}".strip(),
|
||||
created_by_id=self.request.user.pk,
|
||||
)
|
||||
crawl.schedule = crawl_schedule
|
||||
crawl.save(update_fields=['schedule'])
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
# from crawls.actors import CrawlActor
|
||||
# from core.actors import SnapshotActor, ArchiveResultActor
|
||||
|
||||
# from archivebox.crawls.actors import CrawlActor
|
||||
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
|
||||
|
||||
|
||||
rough_url_count = urls.count('://')
|
||||
|
||||
# Build success message with schedule link if created
|
||||
schedule_msg = ""
|
||||
if schedule:
|
||||
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
|
||||
|
||||
messages.success(
|
||||
self.request,
|
||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||
mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
|
||||
)
|
||||
|
||||
# Orchestrator (managed by supervisord) will pick up the queued crawl
|
||||
@@ -516,8 +540,8 @@ def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
from crawls.models import Crawl
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
# Get orchestrator status
|
||||
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
|
||||
def find_config_source(key: str, merged_config: dict) -> str:
|
||||
"""Determine where a config value comes from."""
|
||||
import os
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
# Check if it's from machine config
|
||||
# Check if it's from the Machine.config overrides
|
||||
try:
|
||||
machine = Machine.current()
|
||||
if machine.config and key in machine.config:
|
||||
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Check if it's from config file
|
||||
# Check if it's from the config file
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
# Get merged config that includes Machine.config overrides
|
||||
try:
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
machine = Machine.current()
|
||||
merged_config = get_config()
|
||||
except Exception as e:
|
||||
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
import os
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
@@ -17,8 +17,8 @@ from django_object_actions import action
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
|
||||
from core.models import Snapshot
|
||||
from crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
|
||||
|
||||
def render_snapshots_list(snapshots_qs, limit=20):
|
||||
|
||||
@@ -3,4 +3,4 @@ from django.apps import AppConfig
|
||||
|
||||
class CrawlsConfig(AppConfig):
|
||||
default_auto_field = "django.db.models.BigAutoField"
|
||||
name = "crawls"
|
||||
name = "archivebox.crawls"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING, Iterable
|
||||
from datetime import timedelta
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from pathlib import Path
|
||||
|
||||
@@ -11,13 +12,15 @@ from django.conf import settings
|
||||
from django.urls import reverse_lazy
|
||||
from django.utils import timezone
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
from statemachine import State, registry
|
||||
from rich import print
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
|
||||
from workers.models import ModelWithStateMachine
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
|
||||
|
||||
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
||||
@@ -35,6 +38,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
||||
crawl_set: models.Manager['Crawl']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'crawls'
|
||||
verbose_name = 'Scheduled Crawl'
|
||||
verbose_name_plural = 'Scheduled Crawls'
|
||||
|
||||
@@ -73,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
|
||||
state_machine_name = 'crawls.statemachines.CrawlMachine'
|
||||
state_machine_name = 'crawls.models.CrawlMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
@@ -82,6 +86,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'crawls'
|
||||
verbose_name = 'Crawl'
|
||||
verbose_name_plural = 'Crawls'
|
||||
|
||||
@@ -168,7 +173,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return Path(path_str)
|
||||
|
||||
def create_root_snapshot(self) -> 'Snapshot':
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else None
|
||||
if not first_url:
|
||||
@@ -245,7 +250,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
List of newly created Snapshot objects
|
||||
"""
|
||||
import json
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
created_snapshots = []
|
||||
|
||||
@@ -309,9 +314,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
import time
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hook, discover_hooks, process_hook_records
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config with crawl context
|
||||
config = get_config(crawl=self)
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
hooks = discover_hooks('Crawl')
|
||||
hooks = discover_hooks('Crawl', config=config)
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
|
||||
for hook in hooks:
|
||||
@@ -323,8 +332,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=output_dir,
|
||||
timeout=60,
|
||||
config_objects=[self],
|
||||
config=config,
|
||||
crawl_id=str(self.id),
|
||||
source_url=first_url,
|
||||
)
|
||||
@@ -380,7 +388,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
pass
|
||||
|
||||
# Run on_CrawlEnd hooks
|
||||
hooks = discover_hooks('CrawlEnd')
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=self)
|
||||
|
||||
hooks = discover_hooks('CrawlEnd', config=config)
|
||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
||||
|
||||
for hook in hooks:
|
||||
@@ -391,8 +402,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=output_dir,
|
||||
timeout=30,
|
||||
config_objects=[self],
|
||||
config=config,
|
||||
crawl_id=str(self.id),
|
||||
source_url=first_url,
|
||||
)
|
||||
@@ -400,3 +410,131 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# Log failures but don't block
|
||||
if result and result['returncode'] != 0:
|
||||
print(f'[yellow]⚠️ CrawlEnd hook failed: {hook.name}[/yellow]')
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machines
|
||||
# =============================================================================
|
||||
|
||||
class CrawlMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing Crawl lifecycle: queued -> started -> sealed.

    Hook lifecycle, advanced one tick() at a time:

    QUEUED
        Waiting for the crawl to be ready (has URLs); moves on when
        can_start() is true.

    STARTED -> enter_started()
        1. crawl.run()
           - discover_hooks('Crawl') finds all crawl hooks
           - for each hook: run_hook(script, output_dir, ...), parse the
             JSONL from the hook output, process_hook_records() creates
             Snapshots
           - create_root_snapshot() makes the root snapshot for the crawl
           - create_snapshots_from_urls() makes snapshots from self.urls
        2. Snapshots process independently with their own state machines
           (see SnapshotMachine).

    SEALED -> enter_sealed(), reached when is_finished() is true
        - cleanup() runs on_CrawlEnd hooks and cleans up background hooks
        - retry_at is set to None (no more processing)
    """

    model_attr_name = 'crawl'

    # States
    queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
    started = State(value=Crawl.StatusChoices.STARTED)
    sealed = State(value=Crawl.StatusChoices.SEALED, final=True)

    # Tick Event: each state either loops on itself or advances, depending
    # on its guard condition
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        """Return True only if the crawl has a urls value that yields at least one URL."""
        if not self.crawl.urls:
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
            return False

        if not self.crawl.get_urls_list():
            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
            return False

        return True

    def is_finished(self) -> bool:
        """Return True once the crawl has snapshots and none is still queued or started."""
        from archivebox.core.models import Snapshot

        crawl_snapshots = Snapshot.objects.filter(crawl=self.crawl)

        # a crawl without any snapshot yet is never considered finished
        if not crawl_snapshots.exists():
            return False

        # Snapshots handle their own background hooks via the step system,
        # so the crawl only has to wait until none of them is left in a
        # non-final state
        unfinished = crawl_snapshots.filter(
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
        )
        return not unfinished.exists()

    @started.enter
    def enter_started(self):
        """Run the crawl, then mark it STARTED; log and re-raise on failure."""
        # Lock the crawl for 30 seconds by bumping retry_at so other workers
        # don't pick it up while snapshots are being created
        lock_until = timezone.now() + timedelta(seconds=30)
        self.crawl.update_and_requeue(retry_at=lock_until)

        try:
            # Runs hooks, processes JSONL, creates snapshots
            self.crawl.run()

            # Snapshots exist now: record STARTED and schedule the next check
            # 5s out so we don't busy-loop while the snapshots are processed
            next_check = timezone.now() + timedelta(seconds=5)
            self.crawl.update_and_requeue(
                retry_at=next_check,
                status=Crawl.StatusChoices.STARTED,
            )
        except Exception as e:
            print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
            import traceback
            traceback.print_exc()
            # Re-raise so the worker knows it failed
            raise

    def on_started_to_started(self):
        """Called when Crawl stays in started state (snapshots not sealed yet)."""
        # Push retry_at out so the crawl is checked again in a few seconds
        recheck_at = timezone.now() + timedelta(seconds=5)
        self.crawl.update_and_requeue(retry_at=recheck_at)

    @sealed.enter
    def enter_sealed(self):
        """Clean up the crawl and record the final SEALED status."""
        # Clean up background hooks and run on_CrawlEnd hooks
        self.crawl.cleanup()

        # retry_at=None: no more processing for this crawl
        self.crawl.update_and_requeue(
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Register State Machines
|
||||
# =============================================================================
|
||||
|
||||
# Manually register state machines with python-statemachine registry
|
||||
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
|
||||
registry.register(CrawlMachine)
|
||||
|
||||
@@ -1,114 +0,0 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
import os
|
||||
from typing import ClassVar
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
# from workers.actor import ActorType
|
||||
from crawls.models import Crawl
|
||||
|
||||
|
||||
class CrawlMachine(StateMachine, strict_states=True):
|
||||
"""State machine for managing Crawl lifecycle."""
|
||||
|
||||
model: Crawl
|
||||
|
||||
# States
|
||||
queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=Crawl.StatusChoices.STARTED)
|
||||
sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
def __init__(self, crawl, *args, **kwargs):
|
||||
self.crawl = crawl
|
||||
super().__init__(crawl, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'Crawl[{self.crawl.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
if not self.crawl.urls:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
|
||||
return False
|
||||
urls_list = self.crawl.get_urls_list()
|
||||
if not urls_list:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
|
||||
# check that at least one snapshot exists for this crawl
|
||||
snapshots = Snapshot.objects.filter(crawl=self.crawl)
|
||||
if not snapshots.exists():
|
||||
return False
|
||||
|
||||
# check to make sure no snapshots are in non-final states
|
||||
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
|
||||
return False
|
||||
|
||||
# check that some archiveresults exist for this crawl
|
||||
results = ArchiveResult.objects.filter(snapshot__crawl=self.crawl)
|
||||
if not results.exists():
|
||||
return False
|
||||
|
||||
# check if all archiveresults are finished
|
||||
if results.filter(status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]).exists():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
# def before_transition(self, event, state):
|
||||
# print(f"Before '{event}', on the '{state.id}' state.")
|
||||
# return "before_transition_return"
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# Suppressed: state transition logs
|
||||
# Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds
|
||||
)
|
||||
|
||||
try:
|
||||
# Run the crawl - runs hooks, processes JSONL, creates snapshots
|
||||
self.crawl.run()
|
||||
|
||||
# Update status to STARTED once snapshots are created
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now(), # Process immediately
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Re-raise so the worker knows it failed
|
||||
raise
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
# Clean up background hooks and run on_CrawlEnd hooks
|
||||
self.crawl.cleanup()
|
||||
|
||||
# Suppressed: state transition logs
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
)
|
||||
@@ -146,11 +146,16 @@ class HookResult(TypedDict, total=False):
|
||||
records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
|
||||
|
||||
|
||||
def discover_hooks(event_name: str) -> List[Path]:
|
||||
def discover_hooks(
|
||||
event_name: str,
|
||||
filter_disabled: bool = True,
|
||||
config: Optional[Dict[str, Any]] = None
|
||||
) -> List[Path]:
|
||||
"""
|
||||
Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
|
||||
|
||||
Searches both built-in and user plugin directories.
|
||||
Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
|
||||
Returns scripts sorted alphabetically by filename for deterministic execution order.
|
||||
|
||||
Hook naming convention uses numeric prefixes to control order:
|
||||
@@ -158,9 +163,29 @@ def discover_hooks(event_name: str) -> List[Path]:
|
||||
on_Snapshot__15_singlefile.py # runs second
|
||||
on_Snapshot__26_readability.py # runs later (depends on singlefile)
|
||||
|
||||
Example:
|
||||
Args:
|
||||
event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
|
||||
filter_disabled: If True, skip hooks from disabled plugins (default: True)
|
||||
config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
|
||||
If None, will call get_config() with global scope
|
||||
|
||||
Returns:
|
||||
Sorted list of hook script paths from enabled plugins only.
|
||||
|
||||
Examples:
|
||||
# With proper config context (recommended):
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
discover_hooks('Snapshot', config=config)
|
||||
# Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
|
||||
|
||||
# Without config (uses global defaults):
|
||||
discover_hooks('Snapshot')
|
||||
# Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
|
||||
# Returns: [Path('.../on_Snapshot__10_title.py'), ...]
|
||||
|
||||
# Show all plugins regardless of enabled status:
|
||||
discover_hooks('Snapshot', filter_disabled=False)
|
||||
# Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
|
||||
"""
|
||||
hooks = []
|
||||
|
||||
@@ -177,45 +202,44 @@ def discover_hooks(event_name: str) -> List[Path]:
|
||||
pattern_direct = f'on_{event_name}__*.{ext}'
|
||||
hooks.extend(base_dir.glob(pattern_direct))
|
||||
|
||||
# Filter by enabled plugins
|
||||
if filter_disabled:
|
||||
# Get merged config if not provided (lazy import to avoid circular dependency)
|
||||
if config is None:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(scope='global')
|
||||
|
||||
enabled_hooks = []
|
||||
|
||||
for hook in hooks:
|
||||
# Get plugin name from parent directory
|
||||
# e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
|
||||
plugin_name = hook.parent.name
|
||||
|
||||
# Check if this is a plugin directory (not the root plugins dir)
|
||||
if plugin_name in ('plugins', '.'):
|
||||
# Hook is in root plugins directory, not a plugin subdir
|
||||
# Include it by default (no filtering for non-plugin hooks)
|
||||
enabled_hooks.append(hook)
|
||||
continue
|
||||
|
||||
# Check if plugin is enabled
|
||||
plugin_config = get_plugin_special_config(plugin_name, config)
|
||||
if plugin_config['enabled']:
|
||||
enabled_hooks.append(hook)
|
||||
|
||||
hooks = enabled_hooks
|
||||
|
||||
# Sort by filename (not full path) to ensure numeric prefix ordering works
|
||||
# e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
|
||||
return sorted(set(hooks), key=lambda p: p.name)
|
||||
|
||||
|
||||
def discover_all_hooks() -> Dict[str, List[Path]]:
    """
    Discover all hooks organized by event name.

    Scans BUILTIN_PLUGINS_DIR and USER_PLUGINS_DIR (skipping either one if it
    does not exist) for scripts named on_{EventName}__*.{sh,py,js}. Hooks are
    picked up both from plugin subdirectories and from the plugins directory
    itself, matching the direct-pattern lookup that discover_hooks() performs.

    Returns:
        Dict mapping each event name to its hook script paths, de-duplicated
        and sorted by filename (not full path) so numeric prefixes control
        the order.
    """
    hooks_by_event: Dict[str, List[Path]] = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ('sh', 'py', 'js'):
            # '*/on_...' finds hooks inside plugin subdirs; the bare pattern
            # finds hooks placed directly in the plugins dir
            for pattern in (f'*/on_*__*.{ext}', f'on_*__*.{ext}'):
                for hook_path in base_dir.glob(pattern):
                    # Extract event name from filename: on_EventName__hook_name.ext
                    stem = hook_path.stem
                    if not stem.startswith('on_'):
                        continue
                    event_name, separator, _ = stem[3:].partition('__')
                    # Skip names with no '__' or with an empty event part
                    # (e.g. 'on___x.py'), which do not name a usable event
                    if not separator or not event_name:
                        continue
                    hooks_by_event.setdefault(event_name, []).append(hook_path)

    # Sort hooks within each event, dropping duplicate paths
    return {
        event_name: sorted(set(paths), key=lambda p: p.name)
        for event_name, paths in hooks_by_event.items()
    }
|
||||
|
||||
|
||||
def run_hook(
|
||||
script: Path,
|
||||
output_dir: Path,
|
||||
timeout: int = 300,
|
||||
config_objects: Optional[List[Any]] = None,
|
||||
config: Dict[str, Any],
|
||||
timeout: Optional[int] = None,
|
||||
**kwargs: Any
|
||||
) -> HookResult:
|
||||
"""
|
||||
@@ -224,31 +248,33 @@ def run_hook(
|
||||
This is the low-level hook executor. For running extractors with proper
|
||||
metadata handling, use call_extractor() instead.
|
||||
|
||||
Config is passed to hooks via environment variables with this priority:
|
||||
1. Plugin schema defaults (config.json)
|
||||
2. Config file (ArchiveBox.conf)
|
||||
3. Environment variables
|
||||
4. Machine.config (auto-included, lowest override priority)
|
||||
5. config_objects (in order - later objects override earlier ones)
|
||||
Config is passed to hooks via environment variables. Caller MUST use
|
||||
get_config() to merge all sources (file, env, machine, crawl, snapshot).
|
||||
|
||||
Args:
|
||||
script: Path to the hook script (.sh, .py, or .js)
|
||||
output_dir: Working directory for the script (where output files go)
|
||||
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
|
||||
timeout: Maximum execution time in seconds
|
||||
config_objects: Optional list of objects with .config JSON fields
|
||||
(e.g., [crawl, snapshot] - later items have higher priority)
|
||||
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
|
||||
**kwargs: Arguments passed to the script as --key=value
|
||||
|
||||
Returns:
|
||||
HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
|
||||
|
||||
Example:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
|
||||
"""
|
||||
import time
|
||||
start_time = time.time()
|
||||
|
||||
# Auto-include Machine.config at the start (lowest priority among config_objects)
|
||||
from machine.models import Machine
|
||||
machine = Machine.current()
|
||||
all_config_objects = [machine] + list(config_objects or [])
|
||||
# Auto-detect timeout from plugin config if not explicitly provided
|
||||
if timeout is None:
|
||||
plugin_name = script.parent.name
|
||||
plugin_config = get_plugin_special_config(plugin_name, config)
|
||||
timeout = plugin_config['timeout']
|
||||
|
||||
if not script.exists():
|
||||
return HookResult(
|
||||
@@ -302,51 +328,16 @@ def run_hook(
|
||||
env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
|
||||
env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
|
||||
|
||||
# If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
|
||||
for obj in all_config_objects:
|
||||
if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'): # Duck-type check for Crawl
|
||||
env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
|
||||
break
|
||||
|
||||
# Build overrides from any objects with .config fields (in order, later overrides earlier)
|
||||
# all_config_objects includes Machine at the start, then any passed config_objects
|
||||
overrides = {}
|
||||
for obj in all_config_objects:
|
||||
if obj and hasattr(obj, 'config') and obj.config:
|
||||
# Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
|
||||
for key, value in obj.config.items():
|
||||
clean_key = key.removeprefix('config/')
|
||||
overrides[clean_key] = value
|
||||
|
||||
# Get plugin config from JSON schemas with hierarchy resolution
|
||||
# This merges: schema defaults -> config file -> env vars -> object config overrides
|
||||
plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
|
||||
export_plugin_config_to_env(plugin_config, env)
|
||||
|
||||
# Also pass core config values that aren't in plugin schemas yet
|
||||
# These are legacy values that may still be needed
|
||||
from archivebox import config
|
||||
env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
|
||||
env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
|
||||
env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
|
||||
env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
|
||||
env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
|
||||
env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
|
||||
env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
|
||||
env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
|
||||
env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
|
||||
env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
|
||||
env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
|
||||
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
|
||||
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
|
||||
|
||||
# Pass SEARCH_BACKEND_ENGINE from new-style config
|
||||
try:
|
||||
from archivebox.config.configset import get_config
|
||||
search_config = get_config()
|
||||
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
|
||||
except Exception:
|
||||
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
|
||||
# Export all config values to environment (already merged by get_config())
|
||||
for key, value in config.items():
|
||||
if value is None:
|
||||
continue
|
||||
elif isinstance(value, bool):
|
||||
env[key] = 'true' if value else 'false'
|
||||
elif isinstance(value, (list, dict)):
|
||||
env[key] = json.dumps(value)
|
||||
else:
|
||||
env[key] = str(value)
|
||||
|
||||
# Create output directory if needed
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -525,31 +516,35 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
|
||||
def run_hooks(
|
||||
event_name: str,
|
||||
output_dir: Path,
|
||||
timeout: int = 300,
|
||||
config: Dict[str, Any],
|
||||
timeout: Optional[int] = None,
|
||||
stop_on_failure: bool = False,
|
||||
config_objects: Optional[List[Any]] = None,
|
||||
**kwargs: Any
|
||||
) -> List[HookResult]:
|
||||
"""
|
||||
Run all hooks for a given event.
|
||||
|
||||
Args:
|
||||
event_name: The event name to trigger (e.g., 'Snapshot__wget')
|
||||
event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
|
||||
output_dir: Working directory for hook scripts
|
||||
timeout: Maximum execution time per hook
|
||||
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
|
||||
timeout: Maximum execution time per hook (None = auto-detect from plugin config)
|
||||
stop_on_failure: If True, stop executing hooks after first failure
|
||||
config_objects: Optional list of objects with .config JSON fields
|
||||
(e.g., [crawl, snapshot] - later items have higher priority)
|
||||
**kwargs: Arguments passed to each hook script
|
||||
|
||||
Returns:
|
||||
List of results from each hook execution
|
||||
|
||||
Example:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
|
||||
"""
|
||||
hooks = discover_hooks(event_name)
|
||||
hooks = discover_hooks(event_name, config=config)
|
||||
results = []
|
||||
|
||||
for hook in hooks:
|
||||
result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
|
||||
result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
|
||||
|
||||
# Background hooks return None - skip adding to results
|
||||
if result is None:
|
||||
@@ -638,24 +633,44 @@ EXTRACTOR_INDEXING_PRECEDENCE = [
|
||||
]
|
||||
|
||||
|
||||
def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
|
||||
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
||||
"""
|
||||
Get the list of enabled plugins based on config and available hooks.
|
||||
|
||||
Checks for ENABLED_PLUGINS (or legacy ENABLED_EXTRACTORS) in config,
|
||||
falls back to discovering available hooks from the plugins directory.
|
||||
Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
|
||||
|
||||
Returns plugin names sorted alphabetically (numeric prefix controls order).
|
||||
Args:
|
||||
config: Merged config dict from get_config() - if None, uses global config
|
||||
|
||||
Returns:
|
||||
Plugin names sorted alphabetically (numeric prefix controls order).
|
||||
|
||||
Example:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
|
||||
"""
|
||||
if config:
|
||||
# Support both new and legacy config keys
|
||||
if 'ENABLED_PLUGINS' in config:
|
||||
return config['ENABLED_PLUGINS']
|
||||
if 'ENABLED_EXTRACTORS' in config:
|
||||
return config['ENABLED_EXTRACTORS']
|
||||
# Get merged config if not provided
|
||||
if config is None:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(scope='global')
|
||||
|
||||
# Discover from hooks - this is the source of truth
|
||||
return get_plugins()
|
||||
# Support explicit ENABLED_PLUGINS override (legacy)
|
||||
if 'ENABLED_PLUGINS' in config:
|
||||
return config['ENABLED_PLUGINS']
|
||||
if 'ENABLED_EXTRACTORS' in config:
|
||||
return config['ENABLED_EXTRACTORS']
|
||||
|
||||
# Filter all plugins by enabled status
|
||||
all_plugins = get_plugins()
|
||||
enabled = []
|
||||
|
||||
for plugin in all_plugins:
|
||||
plugin_config = get_plugin_special_config(plugin, config)
|
||||
if plugin_config['enabled']:
|
||||
enabled.append(plugin)
|
||||
|
||||
return enabled
|
||||
|
||||
|
||||
def discover_plugins_that_provide_interface(
|
||||
@@ -822,37 +837,6 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
|
||||
return configs
|
||||
|
||||
|
||||
def get_merged_config_schema() -> Dict[str, Any]:
|
||||
"""
|
||||
Get a merged JSONSchema combining all plugin config schemas.
|
||||
|
||||
This creates a single schema that can validate all plugin config keys.
|
||||
Useful for validating the complete configuration at startup.
|
||||
|
||||
Returns:
|
||||
Combined JSONSchema with all plugin properties merged.
|
||||
"""
|
||||
plugin_configs = discover_plugin_configs()
|
||||
|
||||
merged_properties = {}
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
properties = schema.get('properties', {})
|
||||
for key, prop_schema in properties.items():
|
||||
if key in merged_properties:
|
||||
# Key already exists from another plugin - log warning but keep first
|
||||
import sys
|
||||
print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
|
||||
continue
|
||||
merged_properties[key] = prop_schema
|
||||
|
||||
return {
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": True, # Allow unknown keys (core config, etc.)
|
||||
"properties": merged_properties,
|
||||
}
|
||||
|
||||
|
||||
def get_config_defaults_from_plugins() -> Dict[str, Any]:
|
||||
"""
|
||||
Get default values for all plugin config options.
|
||||
@@ -873,173 +857,63 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]:
|
||||
return defaults
|
||||
|
||||
|
||||
def resolve_config_value(
|
||||
key: str,
|
||||
prop_schema: Dict[str, Any],
|
||||
env_vars: Dict[str, str],
|
||||
config_file: Dict[str, str],
|
||||
overrides: Optional[Dict[str, Any]] = None,
|
||||
) -> Any:
|
||||
def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Resolve a single config value following the hierarchy and schema rules.
|
||||
Extract special config keys for a plugin following naming conventions.
|
||||
|
||||
Resolution order (later overrides earlier):
|
||||
1. Schema default
|
||||
2. x-fallback (global config key)
|
||||
3. Config file (ArchiveBox.conf)
|
||||
4. Environment variables (including x-aliases)
|
||||
5. Explicit overrides (User/Crawl/Snapshot config)
|
||||
ArchiveBox recognizes 3 special config key patterns per plugin:
|
||||
- {PLUGIN}_ENABLED: Enable/disable toggle (default True)
|
||||
- {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
|
||||
- {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
|
||||
|
||||
These allow ArchiveBox to:
|
||||
- Skip disabled plugins (optimization)
|
||||
- Enforce plugin-specific timeouts automatically
|
||||
- Discover plugin binaries for validation
|
||||
|
||||
Args:
|
||||
key: Config key name (e.g., 'WGET_TIMEOUT')
|
||||
prop_schema: JSONSchema property definition for this key
|
||||
env_vars: Environment variables dict
|
||||
config_file: Config file values dict
|
||||
overrides: Optional override values (from User/Crawl/Snapshot)
|
||||
plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
|
||||
config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
|
||||
|
||||
Returns:
|
||||
Resolved value with appropriate type coercion.
|
||||
Dict with standardized keys:
|
||||
{
|
||||
'enabled': True, # bool
|
||||
'timeout': 60, # int, seconds
|
||||
'binary': 'wget', # str, path or name
|
||||
}
|
||||
|
||||
Examples:
|
||||
>>> from archivebox.config.configset import get_config
|
||||
>>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
||||
>>> get_plugin_special_config('wget', config)
|
||||
{'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
|
||||
"""
|
||||
value = None
|
||||
prop_type = prop_schema.get('type', 'string')
|
||||
plugin_upper = plugin_name.upper()
|
||||
|
||||
# 1. Start with schema default
|
||||
if 'default' in prop_schema:
|
||||
value = prop_schema['default']
|
||||
# 1. Enabled: PLUGINNAME_ENABLED (default True)
|
||||
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
|
||||
enabled_key = f'{plugin_upper}_ENABLED'
|
||||
enabled = config.get(enabled_key)
|
||||
if enabled is None:
|
||||
enabled = True
|
||||
elif isinstance(enabled, str):
|
||||
# Handle string values from config file ("true"/"false")
|
||||
enabled = enabled.lower() not in ('false', '0', 'no', '')
|
||||
|
||||
# 2. Check x-fallback (global config key)
|
||||
fallback_key = prop_schema.get('x-fallback')
|
||||
if fallback_key:
|
||||
if fallback_key in env_vars:
|
||||
value = env_vars[fallback_key]
|
||||
elif fallback_key in config_file:
|
||||
value = config_file[fallback_key]
|
||||
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
|
||||
timeout_key = f'{plugin_upper}_TIMEOUT'
|
||||
timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
|
||||
|
||||
# 3. Check config file for main key
|
||||
if key in config_file:
|
||||
value = config_file[key]
|
||||
# 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
|
||||
binary_key = f'{plugin_upper}_BINARY'
|
||||
binary = config.get(binary_key, plugin_name)
|
||||
|
||||
# 4. Check environment variables (main key and aliases)
|
||||
keys_to_check = [key] + prop_schema.get('x-aliases', [])
|
||||
for check_key in keys_to_check:
|
||||
if check_key in env_vars:
|
||||
value = env_vars[check_key]
|
||||
break
|
||||
|
||||
# 5. Apply explicit overrides
|
||||
if overrides and key in overrides:
|
||||
value = overrides[key]
|
||||
|
||||
# Type coercion for env var strings
|
||||
if value is not None and isinstance(value, str):
|
||||
value = coerce_config_value(value, prop_type, prop_schema)
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
|
||||
"""
|
||||
Coerce a string value to the appropriate type based on schema.
|
||||
|
||||
Args:
|
||||
value: String value to coerce
|
||||
prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
|
||||
prop_schema: Full property schema (for array item types, etc.)
|
||||
|
||||
Returns:
|
||||
Coerced value of appropriate type.
|
||||
"""
|
||||
if prop_type == 'boolean':
|
||||
return value.lower() in ('true', '1', 'yes', 'on')
|
||||
elif prop_type == 'integer':
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return prop_schema.get('default', 0)
|
||||
elif prop_type == 'number':
|
||||
try:
|
||||
return float(value)
|
||||
except ValueError:
|
||||
return prop_schema.get('default', 0.0)
|
||||
elif prop_type == 'array':
|
||||
# Try JSON parse first, fall back to comma-separated
|
||||
try:
|
||||
return json.loads(value)
|
||||
except json.JSONDecodeError:
|
||||
return [v.strip() for v in value.split(',') if v.strip()]
|
||||
else:
|
||||
return value
|
||||
|
||||
|
||||
def get_flat_plugin_config(
|
||||
env_vars: Optional[Dict[str, str]] = None,
|
||||
config_file: Optional[Dict[str, str]] = None,
|
||||
overrides: Optional[Dict[str, Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Get all plugin config values resolved according to hierarchy.
|
||||
|
||||
This is the main function for getting plugin configuration.
|
||||
It discovers all plugin schemas and resolves each config key.
|
||||
|
||||
Args:
|
||||
env_vars: Environment variables (defaults to os.environ)
|
||||
config_file: Config file values (from ArchiveBox.conf)
|
||||
overrides: Override values (from User/Crawl/Snapshot config fields)
|
||||
|
||||
Returns:
|
||||
Flat dict of all resolved config values.
|
||||
e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
|
||||
"""
|
||||
if env_vars is None:
|
||||
env_vars = dict(os.environ)
|
||||
if config_file is None:
|
||||
config_file = {}
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
flat_config = {}
|
||||
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
properties = schema.get('properties', {})
|
||||
for key, prop_schema in properties.items():
|
||||
flat_config[key] = resolve_config_value(
|
||||
key, prop_schema, env_vars, config_file, overrides
|
||||
)
|
||||
|
||||
return flat_config
|
||||
|
||||
|
||||
def export_plugin_config_to_env(
|
||||
config: Dict[str, Any],
|
||||
env: Optional[Dict[str, str]] = None,
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
Export plugin config values to environment variable format.
|
||||
|
||||
Converts all values to strings suitable for subprocess environment.
|
||||
Arrays are JSON-encoded.
|
||||
|
||||
Args:
|
||||
config: Flat config dict from get_flat_plugin_config()
|
||||
env: Optional existing env dict to update (creates new if None)
|
||||
|
||||
Returns:
|
||||
Environment dict with config values as strings.
|
||||
"""
|
||||
if env is None:
|
||||
env = {}
|
||||
|
||||
for key, value in config.items():
|
||||
if value is None:
|
||||
continue
|
||||
elif isinstance(value, bool):
|
||||
env[key] = 'true' if value else 'false'
|
||||
elif isinstance(value, (list, dict)):
|
||||
env[key] = json.dumps(value)
|
||||
else:
|
||||
env[key] = str(value)
|
||||
|
||||
return env
|
||||
return {
|
||||
'enabled': bool(enabled),
|
||||
'timeout': int(timeout),
|
||||
'binary': str(binary),
|
||||
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -1233,7 +1107,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
|
||||
if not cmd:
|
||||
return None
|
||||
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
||||
|
||||
@@ -1266,7 +1140,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
|
||||
Returns:
|
||||
Created/updated model instance, or None if type unknown
|
||||
"""
|
||||
from machine.models import Binary, Machine
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
record_type = record.pop('type', None)
|
||||
if not record_type:
|
||||
@@ -1349,25 +1223,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
|
||||
try:
|
||||
# Dispatch to appropriate model's from_jsonl() method
|
||||
if record_type == 'Snapshot':
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
obj = Snapshot.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
|
||||
|
||||
elif record_type == 'Tag':
|
||||
from core.models import Tag
|
||||
from archivebox.core.models import Tag
|
||||
obj = Tag.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Tag'] = stats.get('Tag', 0) + 1
|
||||
|
||||
elif record_type == 'Binary':
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
obj = Binary.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Binary'] = stats.get('Binary', 0) + 1
|
||||
|
||||
elif record_type == 'Machine':
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
obj = Machine.from_jsonl(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Machine'] = stats.get('Machine', 0) + 1
|
||||
|
||||
@@ -4,7 +4,7 @@ from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from machine.models import Machine, NetworkInterface, Binary
|
||||
from archivebox.machine.models import Machine, NetworkInterface, Binary
|
||||
|
||||
|
||||
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
@@ -5,11 +5,11 @@ from django.apps import AppConfig
|
||||
|
||||
class MachineConfig(AppConfig):
|
||||
default_auto_field = 'django.db.models.BigAutoField'
|
||||
|
||||
name = 'machine'
|
||||
|
||||
name = 'archivebox.machine'
|
||||
verbose_name = 'Machine Info'
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
from machine.admin import register_admin
|
||||
from archivebox.machine.admin import register_admin
|
||||
register_admin(admin_site)
|
||||
|
||||
@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
|
||||
|
||||
replaces = [
|
||||
('machine', '0001_initial'),
|
||||
('machine', '0002_alter_machine_stats_binary'),
|
||||
('machine', '0003_alter_binary_options_and_more'),
|
||||
('machine', '0004_alter_binary_abspath_and_more'),
|
||||
('machine', '0002_alter_machine_stats_installedbinary'),
|
||||
('machine', '0003_alter_installedbinary_options_and_more'),
|
||||
('machine', '0004_alter_installedbinary_abspath_and_more'),
|
||||
]
|
||||
|
||||
dependencies = []
|
||||
@@ -70,22 +70,7 @@ class Migration(migrations.Migration):
|
||||
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Dependency',
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
|
||||
('bin_providers', models.CharField(default='*', max_length=127)),
|
||||
('custom_cmds', models.JSONField(blank=True, default=dict)),
|
||||
('config', models.JSONField(blank=True, default=dict)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Dependency',
|
||||
'verbose_name_plural': 'Dependencies',
|
||||
},
|
||||
),
|
||||
# Dependency model removed - not needed anymore
|
||||
migrations.CreateModel(
|
||||
name='Binary',
|
||||
fields=[
|
||||
@@ -100,7 +85,7 @@ class Migration(migrations.Migration):
|
||||
('version', models.CharField(blank=True, default=None, max_length=32)),
|
||||
('sha256', models.CharField(blank=True, default=None, max_length=64)),
|
||||
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
|
||||
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
|
||||
# dependency FK removed - Dependency model deleted
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Binary',
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# Generated manually on 2025-12-26
|
||||
# NOTE: This migration is intentionally empty but kept for dependency chain
|
||||
# The Dependency model was removed in 0004, so all operations have been stripped
|
||||
|
||||
from django.db import migrations, models
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -10,29 +12,5 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='dependency',
|
||||
old_name='custom_cmds',
|
||||
new_name='overrides',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='bin_name',
|
||||
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='bin_providers',
|
||||
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='overrides',
|
||||
field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
|
||||
),
|
||||
# All Dependency operations removed - model deleted in 0004
|
||||
]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# Generated by Django 6.0 on 2025-12-28 05:12
|
||||
# NOTE: This migration is intentionally empty but kept for dependency chain
|
||||
# The Dependency model was removed in 0004, all operations stripped
|
||||
|
||||
import django.db.models.deletion
|
||||
from archivebox import uuid_compat
|
||||
from django.db import migrations, models
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -12,34 +12,6 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='binary',
|
||||
name='dependency',
|
||||
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='binary',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='machine',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='machine',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='networkinterface',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
# All operations removed - Dependency model deleted in 0004
|
||||
# This is a stub migration for users upgrading from old dev versions
|
||||
]
|
||||
|
||||
28
archivebox/machine/migrations/0004_drop_dependency_table.py
Normal file
28
archivebox/machine/migrations/0004_drop_dependency_table.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Generated migration - removes Dependency model entirely
|
||||
# NOTE: This is a cleanup migration for users upgrading from old dev versions
|
||||
# that had the Dependency model. Fresh installs never create this table.
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def drop_dependency_table(apps, schema_editor):
|
||||
"""
|
||||
Drop old Dependency table if it exists (from dev versions that had it).
|
||||
Safe to run multiple times, safe if table doesn't exist.
|
||||
|
||||
Does NOT touch machine_binary - that's our current Binary model table!
|
||||
"""
|
||||
schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
|
||||
# Also drop old InstalledBinary table if it somehow still exists
|
||||
schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(drop_dependency_table, migrations.RunPython.noop),
|
||||
]
|
||||
@@ -1,56 +0,0 @@
|
||||
# Generated migration - Clean slate for Binary model
|
||||
# Drops old InstalledBinary and Dependency tables, creates new Binary table
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.utils.timezone
|
||||
import archivebox.uuid_compat
|
||||
|
||||
|
||||
def drop_old_tables(apps, schema_editor):
|
||||
"""Drop old tables using raw SQL"""
|
||||
schema_editor.execute('DROP TABLE IF EXISTS machine_installedbinary')
|
||||
schema_editor.execute('DROP TABLE IF EXISTS machine_dependency')
|
||||
schema_editor.execute('DROP TABLE IF EXISTS machine_binary') # In case rename happened
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Drop old tables using raw SQL
|
||||
migrations.RunPython(drop_old_tables, migrations.RunPython.noop),
|
||||
|
||||
# Create new Binary model from scratch
|
||||
migrations.CreateModel(
|
||||
name='Binary',
|
||||
fields=[
|
||||
('id', models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
|
||||
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
|
||||
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
|
||||
('binprovider', models.CharField(blank=True, default=None, help_text='Provider that successfully installed this binary', max_length=31)),
|
||||
('abspath', models.CharField(blank=True, default=None, max_length=255)),
|
||||
('version', models.CharField(blank=True, default=None, max_length=32)),
|
||||
('sha256', models.CharField(blank=True, default=None, max_length=64)),
|
||||
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
|
||||
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
|
||||
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('machine', models.ForeignKey(blank=True, default=None, on_delete=models.deletion.CASCADE, to='machine.machine')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Binary',
|
||||
'verbose_name_plural': 'Binaries',
|
||||
},
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='binary',
|
||||
index=models.Index(fields=['machine', 'name', 'abspath', 'version', 'sha256'], name='machine_bin_machine_idx'),
|
||||
),
|
||||
]
|
||||
@@ -4,11 +4,14 @@ import socket
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from datetime import timedelta
|
||||
|
||||
from statemachine import State, registry
|
||||
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from archivebox.base_models.models import ModelWithHealthStats
|
||||
from archivebox.workers.models import BaseStateMachine
|
||||
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
|
||||
|
||||
_CURRENT_MACHINE = None
|
||||
@@ -50,6 +53,9 @@ class Machine(ModelWithHealthStats):
|
||||
objects: MachineManager = MachineManager()
|
||||
networkinterface_set: models.Manager['NetworkInterface']
|
||||
|
||||
class Meta:
|
||||
app_label = 'machine'
|
||||
|
||||
@classmethod
|
||||
def current(cls) -> 'Machine':
|
||||
global _CURRENT_MACHINE
|
||||
@@ -115,6 +121,7 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
objects: NetworkInterfaceManager = NetworkInterfaceManager()
|
||||
|
||||
class Meta:
|
||||
app_label = 'machine'
|
||||
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
|
||||
|
||||
@classmethod
|
||||
@@ -206,11 +213,12 @@ class Binary(ModelWithHealthStats):
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
state_machine_name: str = 'machine.statemachines.BinaryMachine'
|
||||
state_machine_name: str = 'machine.models.BinaryMachine'
|
||||
|
||||
objects: BinaryManager = BinaryManager()
|
||||
|
||||
class Meta:
|
||||
app_label = 'machine'
|
||||
verbose_name = 'Binary'
|
||||
verbose_name_plural = 'Binaries'
|
||||
unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
|
||||
@@ -302,9 +310,9 @@ class Binary(ModelWithHealthStats):
|
||||
DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
|
||||
return Path(DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id)
|
||||
|
||||
def update_for_workers(self, **kwargs):
|
||||
def update_and_requeue(self, **kwargs):
|
||||
"""
|
||||
Update binary fields for worker state machine.
|
||||
Update binary fields and requeue for worker state machine.
|
||||
|
||||
Sets modified_at to ensure workers pick up changes.
|
||||
Always saves the model after updating.
|
||||
@@ -325,6 +333,10 @@ class Binary(ModelWithHealthStats):
|
||||
"""
|
||||
import json
|
||||
from archivebox.hooks import discover_hooks, run_hook
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config (Binary doesn't have crawl/snapshot context)
|
||||
config = get_config(scope='global')
|
||||
|
||||
# Create output directory
|
||||
output_dir = self.OUTPUT_DIR
|
||||
@@ -333,7 +345,7 @@ class Binary(ModelWithHealthStats):
|
||||
self.save()
|
||||
|
||||
# Discover ALL on_Binary__install_* hooks
|
||||
hooks = discover_hooks('Binary')
|
||||
hooks = discover_hooks('Binary', config=config)
|
||||
if not hooks:
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.save()
|
||||
@@ -361,7 +373,8 @@ class Binary(ModelWithHealthStats):
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_output_dir,
|
||||
timeout=600, # 10 min timeout
|
||||
config=config,
|
||||
timeout=600, # 10 min timeout for binary installation
|
||||
**hook_kwargs
|
||||
)
|
||||
|
||||
@@ -420,3 +433,128 @@ class Binary(ModelWithHealthStats):
|
||||
kill_process(pid_file)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Binary State Machine
|
||||
# =============================================================================
|
||||
|
||||
class BinaryMachine(BaseStateMachine, strict_states=True):
    """
    State machine that walks a Binary record through installation.

    Every transition is fired by the ``tick`` event:

        QUEUED
          │  stays QUEUED until can_start() is true
          │  (the binary has both a name and binproviders)
          ▼
        STARTED   → enter_started()
          │  1. lock the record by pushing retry_at into the future
          │  2. call binary.run(), which is expected to set
          │     binary.status to SUCCEEDED or FAILED
          │  3. save the record
          │  stays STARTED until is_finished() is true
          ▼
        SUCCEEDED / FAILED   (final states)
             retry_at is cleared and the health stats are incremented
             with success=True / success=False respectively.

    The guard methods below only read ``binary.status``; they never
    change it themselves.
    """

    # Name of the attribute under which the wrapped Binary is exposed
    # on this machine (the methods below read it as ``self.binary``).
    model_attr_name = 'binary'

    # States -- values mirror Binary.StatusChoices
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # Single event; the guard strings name the methods defined below.
    # The order of the alternatives is kept exactly as declared originally.
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
    )

    def can_start(self) -> bool:
        """Return True when the binary has both a name and binproviders."""
        binary = self.binary
        return bool(binary.name and binary.binproviders)

    def is_succeeded(self) -> bool:
        """Return True when the binary's status is SUCCEEDED."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Return True when the binary's status is FAILED."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """Return True when the status is either SUCCEEDED or FAILED."""
        done_statuses = (
            Binary.StatusChoices.SUCCEEDED,
            Binary.StatusChoices.FAILED,
        )
        return self.binary.status in done_statuses

    @queued.enter
    def enter_queued(self):
        """Mark the binary QUEUED and make it due immediately."""
        due_now = timezone.now()
        self.binary.update_and_requeue(
            retry_at=due_now,
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Mark the binary STARTED, run the install, then save the result."""
        # Push retry_at 5 minutes out so the record is held while the
        # installation runs.
        # NOTE(review): Binary's hook runner appears to pass timeout=600 to
        # run_hook, which is longer than this 300s window -- confirm the
        # lock cannot lapse while a hook is still running.
        locked_until = timezone.now() + timedelta(seconds=300)
        self.binary.update_and_requeue(
            retry_at=locked_until,
            status=Binary.StatusChoices.STARTED,
        )

        # run() is expected to leave binary.status at SUCCEEDED or FAILED;
        # persist whatever it set.
        self.binary.run()
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Record a successful install: clear retry_at, bump health stats."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        self.binary.increment_health_stats(success=True)

    @failed.enter
    def enter_failed(self):
        """Record a failed install: clear retry_at, bump health stats."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        self.binary.increment_health_stats(success=False)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
# =============================================================================
|
||||
|
||||
# Manually register state machines with python-statemachine registry
|
||||
registry.register(BinaryMachine)
|
||||
|
||||
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
__package__ = 'archivebox.machine'
|
||||
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
from django.db.models import F
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
from machine.models import Binary
|
||||
|
||||
|
||||
class BinaryMachine(StateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing Binary installation lifecycle.
|
||||
|
||||
Follows the unified pattern used by Crawl, Snapshot, and ArchiveResult:
|
||||
- queued: Binary needs to be installed
|
||||
- started: Installation hooks are running
|
||||
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
|
||||
- failed: Installation failed permanently
|
||||
"""
|
||||
|
||||
model: Binary
|
||||
|
||||
# States
|
||||
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=Binary.StatusChoices.STARTED)
|
||||
succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=Binary.StatusChoices.FAILED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed')
|
||||
)
|
||||
|
||||
def __init__(self, binary, *args, **kwargs):
|
||||
self.binary = binary
|
||||
super().__init__(binary, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'Binary[{self.binary.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
"""Check if binary installation can start."""
|
||||
return bool(self.binary.name and self.binary.binproviders)
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if installation succeeded (status was set by run())."""
|
||||
return self.binary.status == Binary.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if installation failed (status was set by run())."""
|
||||
return self.binary.status == Binary.StatusChoices.FAILED
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if installation has completed (success or failure)."""
|
||||
return self.binary.status in (
|
||||
Binary.StatusChoices.SUCCEEDED,
|
||||
Binary.StatusChoices.FAILED,
|
||||
)
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
"""Binary is queued for installation."""
|
||||
self.binary.update_for_workers(
|
||||
retry_at=timezone.now(),
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
"""Start binary installation."""
|
||||
# Lock the binary while installation runs
|
||||
self.binary.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=300), # 5 min timeout for installation
|
||||
status=Binary.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
# Run installation hooks
|
||||
self.binary.run()
|
||||
|
||||
# Save updated status (run() updates status to succeeded/failed)
|
||||
self.binary.save()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
"""Binary installed successfully."""
|
||||
self.binary.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Binary.StatusChoices.SUCCEEDED,
|
||||
)
|
||||
|
||||
# Increment health stats
|
||||
Binary.objects.filter(pk=self.binary.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
"""Binary installation failed."""
|
||||
self.binary.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Binary.StatusChoices.FAILED,
|
||||
)
|
||||
|
||||
# Increment health stats
|
||||
Binary.objects.filter(pk=self.binary.pk).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
@@ -250,68 +250,13 @@ def process_records(
|
||||
yield result
|
||||
|
||||
|
||||
def get_or_create_snapshot(record: Dict[str, Any], created_by_id: Optional[int] = None):
|
||||
"""
|
||||
Get or create a Snapshot from a JSONL record.
|
||||
|
||||
Returns the Snapshot instance.
|
||||
"""
|
||||
from core.models import Snapshot
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
|
||||
# Extract fields from record
|
||||
url = record.get('url')
|
||||
if not url:
|
||||
raise ValueError("Record missing required 'url' field")
|
||||
|
||||
title = record.get('title')
|
||||
tags_str = record.get('tags', '')
|
||||
bookmarked_at = record.get('bookmarked_at')
|
||||
depth = record.get('depth', 0)
|
||||
crawl_id = record.get('crawl_id')
|
||||
parent_snapshot_id = record.get('parent_snapshot_id')
|
||||
|
||||
# Parse bookmarked_at if string
|
||||
if bookmarked_at and isinstance(bookmarked_at, str):
|
||||
bookmarked_at = parse_date(bookmarked_at)
|
||||
|
||||
# Use the manager's create_or_update_from_dict method
|
||||
snapshot = Snapshot.objects.create_or_update_from_dict(
|
||||
{'url': url, 'title': title, 'tags': tags_str},
|
||||
created_by_id=created_by_id
|
||||
)
|
||||
|
||||
# Update additional fields if provided
|
||||
update_fields = []
|
||||
if depth is not None and snapshot.depth != depth:
|
||||
snapshot.depth = depth
|
||||
update_fields.append('depth')
|
||||
if parent_snapshot_id and str(snapshot.parent_snapshot_id) != str(parent_snapshot_id):
|
||||
snapshot.parent_snapshot_id = parent_snapshot_id
|
||||
update_fields.append('parent_snapshot_id')
|
||||
if bookmarked_at and snapshot.bookmarked_at != bookmarked_at:
|
||||
snapshot.bookmarked_at = bookmarked_at
|
||||
update_fields.append('bookmarked_at')
|
||||
if crawl_id and str(snapshot.crawl_id) != str(crawl_id):
|
||||
snapshot.crawl_id = crawl_id
|
||||
update_fields.append('crawl_id')
|
||||
|
||||
if update_fields:
|
||||
snapshot.save(update_fields=update_fields + ['modified_at'])
|
||||
|
||||
return snapshot
|
||||
|
||||
|
||||
def get_or_create_tag(record: Dict[str, Any]):
|
||||
"""
|
||||
Get or create a Tag from a JSONL record.
|
||||
|
||||
Returns the Tag instance.
|
||||
"""
|
||||
from core.models import Tag
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
name = record.get('name')
|
||||
if not name:
|
||||
@@ -353,8 +298,11 @@ def process_jsonl_records(records: Iterator[Dict[str, Any]], created_by_id: Opti
|
||||
|
||||
elif record_type == TYPE_SNAPSHOT or 'url' in record:
|
||||
try:
|
||||
snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
results['snapshots'].append(snapshot)
|
||||
from archivebox.core.models import Snapshot
|
||||
overrides = {'created_by_id': created_by_id} if created_by_id else {}
|
||||
snapshot = Snapshot.from_jsonl(record, overrides=overrides)
|
||||
if snapshot:
|
||||
results['snapshots'].append(snapshot)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@ from dataclasses import dataclass
|
||||
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
from rich import print
|
||||
from rich.panel import Panel
|
||||
@@ -257,7 +257,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
|
||||
|
||||
def log_archiving_finished(num_links: int):
|
||||
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
_LAST_RUN_STATS.archiving_end_ts = end_ts
|
||||
@@ -395,7 +395,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
||||
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||
|
||||
def log_list_finished(snapshots):
|
||||
from core.models import Snapshot
|
||||
from archivebox.core.models import Snapshot
|
||||
print()
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
|
||||
@@ -1,335 +0,0 @@
|
||||
__package__ = 'abx.archivebox'
|
||||
|
||||
# from django.test import TestCase
|
||||
|
||||
# from .toml_util import convert, TOML_HEADER
|
||||
|
||||
# TEST_INPUT = """
|
||||
# [SERVER_CONFIG]
|
||||
# IS_TTY=False
|
||||
# USE_COLOR=False
|
||||
# SHOW_PROGRESS=False
|
||||
# IN_DOCKER=False
|
||||
# IN_QEMU=False
|
||||
# PUID=501
|
||||
# PGID=20
|
||||
# CONFIG_FILE=/opt/archivebox/data/ArchiveBox.conf
|
||||
# ONLY_NEW=True
|
||||
# TIMEOUT=60
|
||||
# MEDIA_TIMEOUT=3600
|
||||
# OUTPUT_PERMISSIONS=644
|
||||
# RESTRICT_FILE_NAMES=windows
|
||||
# URL_DENYLIST=\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$
|
||||
# URL_ALLOWLIST=None
|
||||
# ADMIN_USERNAME=None
|
||||
# ADMIN_PASSWORD=None
|
||||
# ENFORCE_ATOMIC_WRITES=True
|
||||
# TAG_SEPARATOR_PATTERN=[,]
|
||||
# SECRET_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
# BIND_ADDR=127.0.0.1:8000
|
||||
# ALLOWED_HOSTS=*
|
||||
# DEBUG=False
|
||||
# PUBLIC_INDEX=True
|
||||
# PUBLIC_SNAPSHOTS=True
|
||||
# PUBLIC_ADD_VIEW=False
|
||||
# FOOTER_INFO=Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.
|
||||
# SNAPSHOTS_PER_PAGE=40
|
||||
# CUSTOM_TEMPLATES_DIR=None
|
||||
# TIME_ZONE=UTC
|
||||
# TIMEZONE=UTC
|
||||
# REVERSE_PROXY_USER_HEADER=Remote-User
|
||||
# REVERSE_PROXY_WHITELIST=
|
||||
# LOGOUT_REDIRECT_URL=/
|
||||
# PREVIEW_ORIGINALS=True
|
||||
# LDAP=False
|
||||
# LDAP_SERVER_URI=None
|
||||
# LDAP_BIND_DN=None
|
||||
# LDAP_BIND_PASSWORD=None
|
||||
# LDAP_USER_BASE=None
|
||||
# LDAP_USER_FILTER=None
|
||||
# LDAP_USERNAME_ATTR=None
|
||||
# LDAP_FIRSTNAME_ATTR=None
|
||||
# LDAP_LASTNAME_ATTR=None
|
||||
# LDAP_EMAIL_ATTR=None
|
||||
# LDAP_CREATE_SUPERUSER=False
|
||||
# SAVE_TITLE=True
|
||||
# SAVE_FAVICON=True
|
||||
# SAVE_WGET=True
|
||||
# SAVE_WGET_REQUISITES=True
|
||||
# SAVE_SINGLEFILE=True
|
||||
# SAVE_READABILITY=True
|
||||
# SAVE_MERCURY=True
|
||||
# SAVE_HTMLTOTEXT=True
|
||||
# SAVE_PDF=True
|
||||
# SAVE_SCREENSHOT=True
|
||||
# SAVE_DOM=True
|
||||
# SAVE_HEADERS=True
|
||||
# SAVE_WARC=True
|
||||
# SAVE_GIT=True
|
||||
# SAVE_MEDIA=True
|
||||
# SAVE_ARCHIVE_DOT_ORG=True
|
||||
# RESOLUTION=1440,2000
|
||||
# GIT_DOMAINS=github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht
|
||||
# CHECK_SSL_VALIDITY=True
|
||||
# MEDIA_MAX_SIZE=750m
|
||||
# USER_AGENT=None
|
||||
# CURL_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)
|
||||
# WGET_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5
|
||||
# CHROME_USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)
|
||||
# COOKIES_FILE=None
|
||||
# CHROME_USER_DATA_DIR=None
|
||||
# CHROME_TIMEOUT=0
|
||||
# CHROME_HEADLESS=True
|
||||
# CHROME_SANDBOX=True
|
||||
# CHROME_EXTRA_ARGS=[]
|
||||
# YOUTUBEDL_ARGS=['--restrict-filenames', '--trim-filenames', '128', '--write-description', '--write-info-json', '--write-annotations', '--write-thumbnail', '--no-call-home', '--write-sub', '--write-auto-subs', '--convert-subs=srt', '--yes-playlist', '--continue', '--no-abort-on-error', '--ignore-errors', '--geo-bypass', '--add-metadata', '--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)']
|
||||
# YOUTUBEDL_EXTRA_ARGS=[]
|
||||
# WGET_ARGS=['--no-verbose', '--adjust-extension', '--convert-links', '--force-directories', '--backup-converted', '--span-hosts', '--no-parent', '-e', 'robots=off']
|
||||
# WGET_EXTRA_ARGS=[]
|
||||
# CURL_ARGS=['--silent', '--location', '--compressed']
|
||||
# CURL_EXTRA_ARGS=[]
|
||||
# GIT_ARGS=['--recursive']
|
||||
# SINGLEFILE_ARGS=[]
|
||||
# SINGLEFILE_EXTRA_ARGS=[]
|
||||
# MERCURY_ARGS=['--format=text']
|
||||
# MERCURY_EXTRA_ARGS=[]
|
||||
# FAVICON_PROVIDER=https://www.google.com/s2/favicons?domain={}
|
||||
# USE_INDEXING_BACKEND=True
|
||||
# USE_SEARCHING_BACKEND=True
|
||||
# SEARCH_BACKEND_ENGINE=ripgrep
|
||||
# SEARCH_BACKEND_HOST_NAME=localhost
|
||||
# SEARCH_BACKEND_PORT=1491
|
||||
# SEARCH_BACKEND_PASSWORD=SecretPassword
|
||||
# SEARCH_PROCESS_HTML=True
|
||||
# SONIC_COLLECTION=archivebox
|
||||
# SONIC_BUCKET=snapshots
|
||||
# SEARCH_BACKEND_TIMEOUT=90
|
||||
# FTS_SEPARATE_DATABASE=True
|
||||
# FTS_TOKENIZERS=porter unicode61 remove_diacritics 2
|
||||
# FTS_SQLITE_MAX_LENGTH=1000000000
|
||||
# USE_CURL=True
|
||||
# USE_WGET=True
|
||||
# USE_SINGLEFILE=True
|
||||
# USE_READABILITY=True
|
||||
# USE_MERCURY=True
|
||||
# USE_GIT=True
|
||||
# USE_CHROME=True
|
||||
# USE_NODE=True
|
||||
# USE_YOUTUBEDL=True
|
||||
# USE_RIPGREP=True
|
||||
# CURL_BINARY=curl
|
||||
# GIT_BINARY=git
|
||||
# WGET_BINARY=wget
|
||||
# SINGLEFILE_BINARY=single-file
|
||||
# READABILITY_BINARY=readability-extractor
|
||||
# MERCURY_BINARY=postlight-parser
|
||||
# YOUTUBEDL_BINARY=yt-dlp
|
||||
# NODE_BINARY=node
|
||||
# RIPGREP_BINARY=rg
|
||||
# CHROME_BINARY=chrome
|
||||
# POCKET_CONSUMER_KEY=None
|
||||
# USER=squash
|
||||
# PACKAGE_DIR=/opt/archivebox/archivebox
|
||||
# TEMPLATES_DIR=/opt/archivebox/archivebox/templates
|
||||
# ARCHIVE_DIR=/opt/archivebox/data/archive
|
||||
# SOURCES_DIR=/opt/archivebox/data/sources
|
||||
# LOGS_DIR=/opt/archivebox/data/logs
|
||||
# PERSONAS_DIR=/opt/archivebox/data/personas
|
||||
# URL_DENYLIST_PTN=re.compile('\\.(css|js|otf|ttf|woff|woff2|gstatic\\.com|googleapis\\.com/css)(\\?.*)?$', re.IGNORECASE|re.MULTILINE)
|
||||
# URL_ALLOWLIST_PTN=None
|
||||
# DIR_OUTPUT_PERMISSIONS=755
|
||||
# ARCHIVEBOX_BINARY=/opt/archivebox/.venv/bin/archivebox
|
||||
# VERSION=0.8.0
|
||||
# COMMIT_HASH=102e87578c6036bb0132dd1ebd17f8f05ffc880f
|
||||
# BUILD_TIME=2024-05-15 03:28:05 1715768885
|
||||
# VERSIONS_AVAILABLE=None
|
||||
# CAN_UPGRADE=False
|
||||
# PYTHON_BINARY=/opt/archivebox/.venv/bin/python3.10
|
||||
# PYTHON_VERSION=3.10.14
|
||||
# DJANGO_BINARY=/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py
|
||||
# DJANGO_VERSION=5.0.6 final (0)
|
||||
# SQLITE_BINARY=/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py
|
||||
# SQLITE_VERSION=2.6.0
|
||||
# CURL_VERSION=curl 8.4.0 (x86_64-apple-darwin23.0)
|
||||
# WGET_VERSION=GNU Wget 1.24.5
|
||||
# WGET_AUTO_COMPRESSION=True
|
||||
# RIPGREP_VERSION=ripgrep 14.1.0
|
||||
# SINGLEFILE_VERSION=None
|
||||
# READABILITY_VERSION=None
|
||||
# MERCURY_VERSION=None
|
||||
# GIT_VERSION=git version 2.44.0
|
||||
# YOUTUBEDL_VERSION=2024.04.09
|
||||
# CHROME_VERSION=Google Chrome 124.0.6367.207
|
||||
# NODE_VERSION=v21.7.3
|
||||
# """
|
||||
|
||||
|
||||
# EXPECTED_OUTPUT = TOML_HEADER + '''[SERVER_CONFIG]
|
||||
# IS_TTY = false
|
||||
# USE_COLOR = false
|
||||
# SHOW_PROGRESS = false
|
||||
# IN_DOCKER = false
|
||||
# IN_QEMU = false
|
||||
# PUID = 501
|
||||
# PGID = 20
|
||||
# CONFIG_FILE = "/opt/archivebox/data/ArchiveBox.conf"
|
||||
# ONLY_NEW = true
|
||||
# TIMEOUT = 60
|
||||
# MEDIA_TIMEOUT = 3600
|
||||
# OUTPUT_PERMISSIONS = 644
|
||||
# RESTRICT_FILE_NAMES = "windows"
|
||||
# URL_DENYLIST = "\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$"
|
||||
# URL_ALLOWLIST = null
|
||||
# ADMIN_USERNAME = null
|
||||
# ADMIN_PASSWORD = null
|
||||
# ENFORCE_ATOMIC_WRITES = true
|
||||
# TAG_SEPARATOR_PATTERN = "[,]"
|
||||
# SECRET_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||
# BIND_ADDR = "127.0.0.1:8000"
|
||||
# ALLOWED_HOSTS = "*"
|
||||
# DEBUG = false
|
||||
# PUBLIC_INDEX = true
|
||||
# PUBLIC_SNAPSHOTS = true
|
||||
# PUBLIC_ADD_VIEW = false
|
||||
# FOOTER_INFO = "Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
|
||||
# SNAPSHOTS_PER_PAGE = 40
|
||||
# CUSTOM_TEMPLATES_DIR = null
|
||||
# TIME_ZONE = "UTC"
|
||||
# TIMEZONE = "UTC"
|
||||
# REVERSE_PROXY_USER_HEADER = "Remote-User"
|
||||
# REVERSE_PROXY_WHITELIST = ""
|
||||
# LOGOUT_REDIRECT_URL = "/"
|
||||
# PREVIEW_ORIGINALS = true
|
||||
# LDAP = false
|
||||
# LDAP_SERVER_URI = null
|
||||
# LDAP_BIND_DN = null
|
||||
# LDAP_BIND_PASSWORD = null
|
||||
# LDAP_USER_BASE = null
|
||||
# LDAP_USER_FILTER = null
|
||||
# LDAP_USERNAME_ATTR = null
|
||||
# LDAP_FIRSTNAME_ATTR = null
|
||||
# LDAP_LASTNAME_ATTR = null
|
||||
# LDAP_EMAIL_ATTR = null
|
||||
# LDAP_CREATE_SUPERUSER = false
|
||||
# SAVE_TITLE = true
|
||||
# SAVE_FAVICON = true
|
||||
# SAVE_WGET = true
|
||||
# SAVE_WGET_REQUISITES = true
|
||||
# SAVE_SINGLEFILE = true
|
||||
# SAVE_READABILITY = true
|
||||
# SAVE_MERCURY = true
|
||||
# SAVE_HTMLTOTEXT = true
|
||||
# SAVE_PDF = true
|
||||
# SAVE_SCREENSHOT = true
|
||||
# SAVE_DOM = true
|
||||
# SAVE_HEADERS = true
|
||||
# SAVE_WARC = true
|
||||
# SAVE_GIT = true
|
||||
# SAVE_MEDIA = true
|
||||
# SAVE_ARCHIVE_DOT_ORG = true
|
||||
# RESOLUTION = [1440, 2000]
|
||||
# GIT_DOMAINS = "github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht"
|
||||
# CHECK_SSL_VALIDITY = true
|
||||
# MEDIA_MAX_SIZE = "750m"
|
||||
# USER_AGENT = null
|
||||
# CURL_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) curl/curl 8.4.0 (x86_64-apple-darwin23.0)"
|
||||
# WGET_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/) wget/GNU Wget 1.24.5"
|
||||
# CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 ArchiveBox/0.8.0 (+https://github.com/ArchiveBox/ArchiveBox/)"
|
||||
# COOKIES_FILE = null
|
||||
# CHROME_USER_DATA_DIR = null
|
||||
# CHROME_TIMEOUT = false
|
||||
# CHROME_HEADLESS = true
|
||||
# CHROME_SANDBOX = true
|
||||
# CHROME_EXTRA_ARGS = []
|
||||
# YOUTUBEDL_ARGS = ["--restrict-filenames", "--trim-filenames", "128", "--write-description", "--write-info-json", "--write-annotations", "--write-thumbnail", "--no-call-home", "--write-sub", "--write-auto-subs", "--convert-subs=srt", "--yes-playlist", "--continue", "--no-abort-on-error", "--ignore-errors", "--geo-bypass", "--add-metadata", "--format=(bv*+ba/b)[filesize<=750m][filesize_approx<=?750m]/(bv*+ba/b)"]
|
||||
# YOUTUBEDL_EXTRA_ARGS = []
|
||||
# WGET_ARGS = ["--no-verbose", "--adjust-extension", "--convert-links", "--force-directories", "--backup-converted", "--span-hosts", "--no-parent", "-e", "robots=off"]
|
||||
# WGET_EXTRA_ARGS = []
|
||||
# CURL_ARGS = ["--silent", "--location", "--compressed"]
|
||||
# CURL_EXTRA_ARGS = []
|
||||
# GIT_ARGS = ["--recursive"]
|
||||
# SINGLEFILE_ARGS = []
|
||||
# SINGLEFILE_EXTRA_ARGS = []
|
||||
# MERCURY_ARGS = ["--format=text"]
|
||||
# MERCURY_EXTRA_ARGS = []
|
||||
# FAVICON_PROVIDER = "https://www.google.com/s2/favicons?domain={}"
|
||||
# USE_INDEXING_BACKEND = true
|
||||
# USE_SEARCHING_BACKEND = true
|
||||
# SEARCH_BACKEND_ENGINE = "ripgrep"
|
||||
# SEARCH_BACKEND_HOST_NAME = "localhost"
|
||||
# SEARCH_BACKEND_PORT = 1491
|
||||
# SEARCH_BACKEND_PASSWORD = "SecretPassword"
|
||||
# SEARCH_PROCESS_HTML = true
|
||||
# SONIC_COLLECTION = "archivebox"
|
||||
# SONIC_BUCKET = "snapshots"
|
||||
# SEARCH_BACKEND_TIMEOUT = 90
|
||||
# FTS_SEPARATE_DATABASE = true
|
||||
# FTS_TOKENIZERS = "porter unicode61 remove_diacritics 2"
|
||||
# FTS_SQLITE_MAX_LENGTH = 1000000000
|
||||
# USE_CURL = true
|
||||
# USE_WGET = true
|
||||
# USE_SINGLEFILE = true
|
||||
# USE_READABILITY = true
|
||||
# USE_MERCURY = true
|
||||
# USE_GIT = true
|
||||
# USE_CHROME = true
|
||||
# USE_NODE = true
|
||||
# USE_YOUTUBEDL = true
|
||||
# USE_RIPGREP = true
|
||||
# CURL_BINARY = "curl"
|
||||
# GIT_BINARY = "git"
|
||||
# WGET_BINARY = "wget"
|
||||
# SINGLEFILE_BINARY = "single-file"
|
||||
# READABILITY_BINARY = "readability-extractor"
|
||||
# MERCURY_BINARY = "postlight-parser"
|
||||
# YOUTUBEDL_BINARY = "yt-dlp"
|
||||
# NODE_BINARY = "node"
|
||||
# RIPGREP_BINARY = "rg"
|
||||
# CHROME_BINARY = "chrome"
|
||||
# POCKET_CONSUMER_KEY = null
|
||||
# USER = "squash"
|
||||
# PACKAGE_DIR = "/opt/archivebox/archivebox"
|
||||
# TEMPLATES_DIR = "/opt/archivebox/archivebox/templates"
|
||||
# ARCHIVE_DIR = "/opt/archivebox/data/archive"
|
||||
# SOURCES_DIR = "/opt/archivebox/data/sources"
|
||||
# LOGS_DIR = "/opt/archivebox/data/logs"
|
||||
# PERSONAS_DIR = "/opt/archivebox/data/personas"
|
||||
# URL_DENYLIST_PTN = "re.compile(\'\\\\.(css|js|otf|ttf|woff|woff2|gstatic\\\\.com|googleapis\\\\.com/css)(\\\\?.*)?$\', re.IGNORECASE|re.MULTILINE)"
|
||||
# URL_ALLOWLIST_PTN = null
|
||||
# DIR_OUTPUT_PERMISSIONS = 755
|
||||
# ARCHIVEBOX_BINARY = "/opt/archivebox/.venv/bin/archivebox"
|
||||
# VERSION = "0.8.0"
|
||||
# COMMIT_HASH = "102e87578c6036bb0132dd1ebd17f8f05ffc880f"
|
||||
# BUILD_TIME = "2024-05-15 03:28:05 1715768885"
|
||||
# VERSIONS_AVAILABLE = null
|
||||
# CAN_UPGRADE = false
|
||||
# PYTHON_BINARY = "/opt/archivebox/.venv/bin/python3.10"
|
||||
# PYTHON_VERSION = "3.10.14"
|
||||
# DJANGO_BINARY = "/opt/archivebox/.venv/lib/python3.10/site-packages/django/__init__.py"
|
||||
# DJANGO_VERSION = "5.0.6 final (0)"
|
||||
# SQLITE_BINARY = "/opt/homebrew/Cellar/python@3.10/3.10.14/Frameworks/Python.framework/Versions/3.10/lib/python3.10/sqlite3/dbapi2.py"
|
||||
# SQLITE_VERSION = "2.6.0"
|
||||
# CURL_VERSION = "curl 8.4.0 (x86_64-apple-darwin23.0)"
|
||||
# WGET_VERSION = "GNU Wget 1.24.5"
|
||||
# WGET_AUTO_COMPRESSION = true
|
||||
# RIPGREP_VERSION = "ripgrep 14.1.0"
|
||||
# SINGLEFILE_VERSION = null
|
||||
# READABILITY_VERSION = null
|
||||
# MERCURY_VERSION = null
|
||||
# GIT_VERSION = "git version 2.44.0"
|
||||
# YOUTUBEDL_VERSION = "2024.04.09"
|
||||
# CHROME_VERSION = "Google Chrome 124.0.6367.207"
|
||||
# NODE_VERSION = "v21.7.3"'''
|
||||
|
||||
|
||||
# class IniToTomlTests(TestCase):
|
||||
# def test_convert(self):
|
||||
# first_output = convert(TEST_INPUT) # make sure ini -> toml parses correctly
|
||||
# second_output = convert(first_output) # make sure toml -> toml parses/dumps consistently
|
||||
# assert first_output == second_output == EXPECTED_OUTPUT # make sure parsing is idempotent
|
||||
|
||||
# # DEBUGGING
|
||||
# import sys
|
||||
# import difflib
|
||||
# sys.stdout.writelines(difflib.context_diff(first_output, second_output, fromfile='first', tofile='second'))
|
||||
# print(repr(second_output))
|
||||
@@ -478,62 +478,6 @@ for url_str, num_urls in _test_url_strs.items():
|
||||
|
||||
### Chrome Helpers
|
||||
|
||||
def chrome_args(**options) -> List[str]:
|
||||
"""Helper to build up a chrome shell command with arguments."""
|
||||
import shutil
|
||||
from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY
|
||||
|
||||
chrome_binary = options.get('CHROME_BINARY', CHROME_BINARY)
|
||||
chrome_headless = options.get('CHROME_HEADLESS', True)
|
||||
chrome_sandbox = options.get('CHROME_SANDBOX', True)
|
||||
check_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
|
||||
user_agent = options.get('CHROME_USER_AGENT', USER_AGENT)
|
||||
resolution = options.get('RESOLUTION', RESOLUTION)
|
||||
timeout = options.get('CHROME_TIMEOUT', 0)
|
||||
user_data_dir = options.get('CHROME_USER_DATA_DIR', None)
|
||||
|
||||
if not chrome_binary:
|
||||
raise Exception('Could not find any CHROME_BINARY installed on your system')
|
||||
|
||||
cmd_args = [chrome_binary]
|
||||
|
||||
if chrome_headless:
|
||||
cmd_args += ("--headless=new",)
|
||||
|
||||
if not chrome_sandbox:
|
||||
# running in docker or other sandboxed environment
|
||||
cmd_args += (
|
||||
"--no-sandbox",
|
||||
"--no-zygote",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-software-rasterizer",
|
||||
"--run-all-compositor-stages-before-draw",
|
||||
"--hide-scrollbars",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--no-first-run",
|
||||
"--use-fake-ui-for-media-stream",
|
||||
"--use-fake-device-for-media-stream",
|
||||
"--disable-sync",
|
||||
)
|
||||
|
||||
if not check_ssl:
|
||||
cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
|
||||
|
||||
if user_agent:
|
||||
cmd_args += (f'--user-agent={user_agent}',)
|
||||
|
||||
if resolution:
|
||||
cmd_args += (f'--window-size={resolution}',)
|
||||
|
||||
if timeout:
|
||||
cmd_args += (f'--timeout={timeout * 1000}',)
|
||||
|
||||
if user_data_dir:
|
||||
cmd_args += (f'--user-data-dir={user_data_dir}',)
|
||||
|
||||
return cmd_args
|
||||
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
|
||||
@@ -3,4 +3,4 @@ from django.apps import AppConfig
|
||||
|
||||
class SessionsConfig(AppConfig):
|
||||
default_auto_field = "django.db.models.BigAutoField"
|
||||
name = "personas"
|
||||
name = "archivebox.personas"
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||
|
||||
# class Meta:
|
||||
# app_label = 'personas'
|
||||
# verbose_name = 'Session Type'
|
||||
# verbose_name_plural = 'Session Types'
|
||||
# unique_together = (('created_by', 'name'),)
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_ARCHIVE_DOT_ORG": {
|
||||
"ARCHIVE_ORG_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
|
||||
"x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
|
||||
"description": "Submit URLs to archive.org Wayback Machine"
|
||||
},
|
||||
"ARCHIVE_ORG_TIMEOUT": {
|
||||
|
||||
10
archivebox/plugins/archive_org/templates/embed.html
Normal file
10
archivebox/plugins/archive_org/templates/embed.html
Normal file
@@ -0,0 +1,10 @@
|
||||
{% load config_tags %}
{# Full-size iframe of output_path; rendered only when the ARCHIVE_ORG_ENABLED config flag is truthy. #}
{% get_config "ARCHIVE_ORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org embed - full iframe view -->
<iframe src="{{ output_path }}"
        class="extractor-embed archivedotorg-embed"
        style="width: 100%; height: 600px; border: 1px solid #ddd;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}
|
||||
10
archivebox/plugins/archive_org/templates/fullscreen.html
Normal file
10
archivebox/plugins/archive_org/templates/fullscreen.html
Normal file
@@ -0,0 +1,10 @@
|
||||
{% load config_tags %}
{# Full-viewport iframe of output_path; rendered only when the ARCHIVE_ORG_ENABLED config flag is truthy. #}
{% get_config "ARCHIVE_ORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen archivedotorg-fullscreen"
        style="width: 100%; height: 100vh; border: none;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}
|
||||
12
archivebox/plugins/archive_org/templates/thumbnail.html
Normal file
12
archivebox/plugins/archive_org/templates/thumbnail.html
Normal file
@@ -0,0 +1,12 @@
|
||||
{% load config_tags %}
{# Non-interactive 100px-high iframe preview of output_path; rendered only when ARCHIVE_ORG_ENABLED is truthy. #}
{% get_config "ARCHIVE_ORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
    <iframe src="{{ output_path }}"
            style="width: 100%; height: 100px; border: none; pointer-events: none;"
            loading="lazy"
            sandbox="allow-same-origin">
    </iframe>
</div>
{% endif %}
|
||||
@@ -60,21 +60,6 @@
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"SAVE_SCREENSHOT": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable screenshot capture"
|
||||
},
|
||||
"SAVE_PDF": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable PDF generation"
|
||||
},
|
||||
"SAVE_DOM": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable DOM capture"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
21
archivebox/plugins/dom/config.json
Normal file
21
archivebox/plugins/dom/config.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"DOM_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_DOM", "USE_DOM"],
|
||||
"description": "Enable DOM capture"
|
||||
},
|
||||
"DOM_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for DOM capture in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_FAVICON": {
|
||||
"FAVICON_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
|
||||
"description": "Enable favicon downloading"
|
||||
},
|
||||
"FAVICON_TIMEOUT": {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for favicon plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists
|
||||
2. requests library is available
|
||||
3. Favicon extraction works for real example.com
|
||||
@@ -40,7 +41,7 @@ def test_requests_library_available():
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
pytest.skip("requests library not installed")
|
||||
pass
|
||||
|
||||
assert len(result.stdout.strip()) > 0, "Should report requests version"
|
||||
|
||||
@@ -58,7 +59,7 @@ def test_extracts_favicon_from_example_com():
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pytest.skip("requests not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -80,6 +81,7 @@ def test_extracts_favicon_from_example_com():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -124,7 +126,7 @@ def test_config_timeout_honored():
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pytest.skip("requests not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -155,7 +157,7 @@ def test_config_user_agent():
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pytest.skip("requests not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -181,6 +183,7 @@ def test_config_user_agent():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -201,7 +204,7 @@ def test_handles_https_urls():
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pytest.skip("requests not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -232,7 +235,7 @@ def test_handles_missing_favicon_gracefully():
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pytest.skip("requests not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_FORUMDL": {
|
||||
"FORUMDL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
|
||||
"description": "Enable forum downloading with forum-dl"
|
||||
},
|
||||
"FORUMDL_BINARY": {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for forumdl plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -48,7 +49,9 @@ def get_forumdl_binary_path():
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
|
||||
@@ -77,7 +80,9 @@ def get_forumdl_binary_path():
|
||||
|
||||
# Parse Binary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if install_line.strip():
|
||||
pass
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
|
||||
@@ -107,7 +112,7 @@ def test_forumdl_install_hook():
|
||||
"""Test forum-dl install hook checks for forum-dl."""
|
||||
# Skip if install hook doesn't exist yet
|
||||
if not FORUMDL_INSTALL_HOOK.exists():
|
||||
pytest.skip(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
|
||||
pass
|
||||
|
||||
# Run forum-dl install hook
|
||||
result = subprocess.run(
|
||||
@@ -123,14 +128,18 @@ def test_forumdl_install_hook():
|
||||
found_dependency = False
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
pass
|
||||
if record['name'] == 'forum-dl':
|
||||
assert record['abspath'], "forum-dl should have abspath"
|
||||
found_binary = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
pass
|
||||
if record['bin_name'] == 'forum-dl':
|
||||
found_dependency = True
|
||||
except json.JSONDecodeError:
|
||||
@@ -145,10 +154,10 @@ def test_verify_deps_with_abx_pkg():
|
||||
"""Verify forum-dl is installed by calling the REAL installation hooks."""
|
||||
binary_path = get_forumdl_binary_path()
|
||||
if not binary_path:
|
||||
pytest.skip(
|
||||
"forum-dl installation skipped. Install hook may not exist or "
|
||||
"forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
|
||||
"due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
|
||||
assert False, (
|
||||
"forum-dl installation failed. Install hook should install forum-dl automatically. "
|
||||
"Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
|
||||
"due to removed longintrepr.h header."
|
||||
)
|
||||
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
@@ -159,7 +168,7 @@ def test_handles_non_forum_url():
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
if not binary_path:
|
||||
pytest.skip("forum-dl binary not available")
|
||||
pass
|
||||
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -186,6 +195,7 @@ def test_handles_non_forum_url():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -231,7 +241,7 @@ def test_config_timeout():
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
if not binary_path:
|
||||
pytest.skip("forum-dl binary not available")
|
||||
pass
|
||||
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_GALLERYDL": {
|
||||
"GALLERYDL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
|
||||
"description": "Enable gallery downloading with gallery-dl"
|
||||
},
|
||||
"GALLERYDL_BINARY": {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for gallerydl plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -45,14 +46,18 @@ def test_gallerydl_install_hook():
|
||||
found_dependency = False
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
pass
|
||||
if record['name'] == 'gallery-dl':
|
||||
assert record['abspath'], "gallery-dl should have abspath"
|
||||
found_binary = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
pass
|
||||
if record['bin_name'] == 'gallery-dl':
|
||||
found_dependency = True
|
||||
except json.JSONDecodeError:
|
||||
@@ -76,7 +81,7 @@ def test_verify_deps_with_abx_pkg():
|
||||
missing_binaries.append('gallery-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
pass
|
||||
|
||||
|
||||
def test_handles_non_gallery_url():
|
||||
@@ -103,6 +108,7 @@ def test_handles_non_gallery_url():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_GIT": {
|
||||
"GIT_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_GIT", "USE_GIT"],
|
||||
"description": "Enable git repository cloning"
|
||||
},
|
||||
"GIT_BINARY": {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for git plugin
|
||||
|
||||
Tests verify:
|
||||
1. Validate hook checks for git binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Standalone git extractor execution
|
||||
@@ -37,7 +38,9 @@ def test_git_install_hook():
|
||||
# Binary found - verify Binary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
@@ -52,7 +55,9 @@ def test_git_install_hook():
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
@@ -74,7 +79,7 @@ def test_verify_deps_with_abx_pkg():
|
||||
if git_loaded and git_loaded.abspath:
|
||||
assert True, "git is available"
|
||||
else:
|
||||
pytest.skip("git not available - Dependency record should have been emitted")
|
||||
pass
|
||||
|
||||
def test_reports_missing_git():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
@@ -88,8 +93,9 @@ def test_reports_missing_git():
|
||||
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
|
||||
|
||||
def test_handles_non_git_url():
|
||||
pass
|
||||
if not shutil.which('git'):
|
||||
pytest.skip("git not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
@@ -104,6 +110,7 @@ def test_handles_non_git_url():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for headers plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists and is executable
|
||||
2. Node.js is available
|
||||
3. Headers extraction works for real example.com
|
||||
@@ -38,7 +39,7 @@ def test_node_is_available():
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
pytest.skip("node not installed on system")
|
||||
pass
|
||||
|
||||
binary_path = result.stdout.strip()
|
||||
assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
|
||||
@@ -59,7 +60,7 @@ def test_extracts_headers_from_example_com():
|
||||
|
||||
# Check node is available
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -80,6 +81,7 @@ def test_extracts_headers_from_example_com():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -119,7 +121,7 @@ def test_headers_output_structure():
|
||||
"""Test that headers plugin produces correctly structured output."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -140,6 +142,7 @@ def test_headers_output_structure():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -175,7 +178,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
|
||||
"""Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -198,6 +201,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -224,7 +228,7 @@ def test_config_timeout_honored():
|
||||
"""Test that TIMEOUT config is respected."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -251,7 +255,7 @@ def test_config_user_agent():
|
||||
"""Test that USER_AGENT config is used."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -277,6 +281,7 @@ def test_config_user_agent():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -293,7 +298,7 @@ def test_handles_https_urls():
|
||||
"""Test that HTTPS URLs work correctly."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
@@ -318,7 +323,7 @@ def test_handles_404_gracefully():
|
||||
"""Test that headers plugin handles 404s gracefully."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
@@ -1,279 +0,0 @@
|
||||
/**
|
||||
* Unit tests for istilldontcareaboutcookies plugin
|
||||
*
|
||||
* Run with: node --test tests/test_istilldontcareaboutcookies.js
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
|
||||
|
||||
// Test fixtures
|
||||
const TEST_DIR = path.join(__dirname, '.test_fixtures');
|
||||
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
|
||||
|
||||
describe('istilldontcareaboutcookies plugin', () => {
    // Module under test and the helper module whose installer is stubbed below.
    const PLUGIN_MODULE = '../on_Snapshot__02_istilldontcareaboutcookies.js';
    const EXTENSION_UTILS_MODULE = '../../chrome_extensions/chrome_extension_utils.js';

    // Identity of the extension as asserted throughout these tests.
    const WEBSTORE_ID = 'edibdbjcniadpccecjdfdjjppcpchdlm';
    const EXTENSION_NAME = 'istilldontcareaboutcookies';
    const CACHE_FILENAME = 'istilldontcareaboutcookies.extension.json';

    /**
     * Register per-test hooks that point CHROME_EXTENSIONS_DIR at a scratch
     * directory and remove it (and the env var) afterwards.
     * Must be called synchronously from inside a describe() callback.
     */
    function useScratchExtensionsDir() {
        beforeEach(() => {
            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
            // recursive: true makes this a no-op when the directory exists.
            fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
        });

        afterEach(() => {
            // force: true makes this a no-op when the directory is already gone.
            fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true, force: true });
            delete process.env.CHROME_EXTENSIONS_DIR;
        });
    }

    /**
     * Temporarily replace extensionUtils.loadOrInstallExtension with `stub`,
     * run `fn`, and always restore the original - even when `fn` rejects -
     * so a failing test cannot leak the stub into later tests.
     *
     * NOTE(review): the stub only takes effect if the plugin looks the
     * function up on the module object at call time; confirm in the plugin.
     */
    async function withStubbedInstaller(stub, fn) {
        const extensionUtils = require(EXTENSION_UTILS_MODULE);
        const originalFunc = extensionUtils.loadOrInstallExtension;
        extensionUtils.loadOrInstallExtension = stub;
        try {
            return await fn();
        } finally {
            extensionUtils.loadOrInstallExtension = originalFunc;
        }
    }

    before(() => {
        fs.mkdirSync(TEST_DIR, { recursive: true });
    });

    after(() => {
        fs.rmSync(TEST_DIR, { recursive: true, force: true });
    });

    describe('EXTENSION metadata', () => {
        it('should have correct webstore_id', () => {
            const { EXTENSION } = require(PLUGIN_MODULE);

            assert.strictEqual(EXTENSION.webstore_id, WEBSTORE_ID);
        });

        it('should have correct name', () => {
            const { EXTENSION } = require(PLUGIN_MODULE);

            assert.strictEqual(EXTENSION.name, EXTENSION_NAME);
        });
    });

    describe('installCookiesExtension', () => {
        useScratchExtensionsDir();

        it('should use cached extension if available', async () => {
            const { installCookiesExtension } = require(PLUGIN_MODULE);

            // Create fake cache: an unpacked extension dir plus its cache record.
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, CACHE_FILENAME);
            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');

            fs.mkdirSync(fakeExtensionDir, { recursive: true });
            fs.writeFileSync(
                path.join(fakeExtensionDir, 'manifest.json'),
                JSON.stringify({ version: '1.1.8' })
            );

            const fakeCache = {
                webstore_id: WEBSTORE_ID,
                name: EXTENSION_NAME,
                unpacked_path: fakeExtensionDir,
                version: '1.1.8'
            };

            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));

            const result = await installCookiesExtension();

            assert.notStrictEqual(result, null);
            assert.strictEqual(result.webstore_id, WEBSTORE_ID);
        });

        it('should not require any configuration', async () => {
            // This extension works out of the box
            // No API keys or config needed
            const { EXTENSION } = require(PLUGIN_MODULE);

            assert.ok(EXTENSION);
            // No config fields should be required
        });
    });

    describe('cache file creation', () => {
        useScratchExtensionsDir();

        it('should create cache file with correct extension name', async () => {
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, CACHE_FILENAME);

            // Create mock extension
            const mockExtension = {
                webstore_id: WEBSTORE_ID,
                name: EXTENSION_NAME,
                version: '1.1.9'
            };

            await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));

            assert.ok(fs.existsSync(cacheFile));

            const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            assert.strictEqual(cache.name, EXTENSION_NAME);
        });

        it('should use correct filename pattern', () => {
            const expectedPattern = CACHE_FILENAME;
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);

            // Pattern should match expected format
            assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
            assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
        });
    });

    describe('extension functionality', () => {
        // NOTE(review): both tests below only assert on literals defined in the
        // test itself; they document expectations but do not exercise the plugin.
        it('should work automatically without configuration', () => {
            // This extension automatically dismisses cookie banners
            // No manual trigger or configuration needed

            const features = {
                automaticBannerDismissal: true,
                requiresConfiguration: false,
                requiresApiKey: false,
                requiresUserAction: false
            };

            assert.strictEqual(features.automaticBannerDismissal, true);
            assert.strictEqual(features.requiresConfiguration, false);
            assert.strictEqual(features.requiresApiKey, false);
            assert.strictEqual(features.requiresUserAction, false);
        });

        it('should not require any runtime hooks', () => {
            // Extension works purely via Chrome's content script injection
            // No need for additional hooks or configuration

            const requiresHooks = {
                preNavigation: false,
                postNavigation: false,
                onPageLoad: false
            };

            assert.strictEqual(requiresHooks.preNavigation, false);
            assert.strictEqual(requiresHooks.postNavigation, false);
            assert.strictEqual(requiresHooks.onPageLoad, false);
        });
    });

    describe('priority and execution order', () => {
        it('should have priority 02 (early)', () => {
            const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';

            // Extract priority from filename
            const match = filename.match(/on_Snapshot__(\d+)_/);
            assert.ok(match);

            // Explicit radix so the zero-padded "02" is always parsed as decimal.
            const priority = parseInt(match[1], 10);
            assert.strictEqual(priority, 2);
        });

        it('should run before chrome (priority 20)', () => {
            const extensionPriority = 2;
            const chromeSessionPriority = 20;

            assert.ok(extensionPriority < chromeSessionPriority);
        });
    });

    describe('error handling', () => {
        useScratchExtensionsDir();

        it('should handle corrupted cache gracefully', async () => {
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, CACHE_FILENAME);

            // Create corrupted cache
            fs.writeFileSync(cacheFile, 'invalid json content');

            // Should detect corruption and proceed with fresh install
            const { installCookiesExtension } = require(PLUGIN_MODULE);

            // Stub loadOrInstallExtension to avoid an actual download
            const result = await withStubbedInstaller(
                async () => ({
                    webstore_id: WEBSTORE_ID,
                    name: EXTENSION_NAME,
                    version: '1.1.9'
                }),
                () => installCookiesExtension()
            );

            assert.notStrictEqual(result, null);
        });

        it('should handle missing manifest gracefully', async () => {
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, CACHE_FILENAME);
            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');

            // Create directory without manifest
            fs.mkdirSync(fakeExtensionDir, { recursive: true });

            const fakeCache = {
                webstore_id: WEBSTORE_ID,
                name: EXTENSION_NAME,
                unpacked_path: fakeExtensionDir
            };

            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));

            const { installCookiesExtension } = require(PLUGIN_MODULE);

            // Stub returns a fresh extension and records that it was invoked
            let freshInstallCalled = false;
            const result = await withStubbedInstaller(
                async () => {
                    freshInstallCalled = true;
                    return {
                        webstore_id: WEBSTORE_ID,
                        name: EXTENSION_NAME,
                        version: '1.1.9'
                    };
                },
                () => installCookiesExtension()
            );

            // Should trigger fresh install when manifest missing
            assert.ok(freshInstallCalled || result);
        });
    });
});
|
||||
@@ -3,16 +3,16 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_MEDIA": {
|
||||
"MEDIA_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
|
||||
"x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
|
||||
"description": "Enable media downloading with yt-dlp"
|
||||
},
|
||||
"YOUTUBEDL_BINARY": {
|
||||
"MEDIA_BINARY": {
|
||||
"type": "string",
|
||||
"default": "yt-dlp",
|
||||
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
|
||||
"x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
|
||||
"description": "Path to yt-dlp binary"
|
||||
},
|
||||
"MEDIA_TIMEOUT": {
|
||||
@@ -28,13 +28,14 @@
|
||||
"pattern": "^\\d+[kmgKMG]?$",
|
||||
"description": "Maximum file size for media downloads"
|
||||
},
|
||||
"YTDLP_CHECK_SSL_VALIDITY": {
|
||||
"MEDIA_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"YTDLP_ARGS": {
|
||||
"MEDIA_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
@@ -44,11 +45,13 @@
|
||||
"--embed-subs",
|
||||
"--write-auto-sub"
|
||||
],
|
||||
"x-aliases": ["YTDLP_ARGS"],
|
||||
"description": "Default yt-dlp arguments"
|
||||
},
|
||||
"YTDLP_EXTRA_ARGS": {
|
||||
"MEDIA_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-aliases": ["YTDLP_EXTRA_ARGS"],
|
||||
"description": "Extra arguments for yt-dlp (space-separated)"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for media plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -45,7 +46,9 @@ def test_ytdlp_install_hook():
|
||||
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
@@ -94,7 +97,7 @@ def test_verify_deps_with_abx_pkg():
|
||||
missing_binaries.append('ffmpeg')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
pass
|
||||
|
||||
def test_handles_non_media_url():
|
||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||
@@ -120,6 +123,7 @@ def test_handles_non_media_url():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_MERCURY": {
|
||||
"MERCURY_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_MERCURY", "USE_MERCURY"],
|
||||
"description": "Enable Mercury text extraction"
|
||||
},
|
||||
"MERCURY_BINARY": {
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Integration tests for mercury plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -44,7 +45,9 @@ def test_mercury_install_hook():
|
||||
# Binary found - verify Binary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
@@ -59,7 +62,9 @@ def test_mercury_install_hook():
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
|
||||
if mercury_loaded and mercury_loaded.abspath:
|
||||
assert True, "postlight-parser is available"
|
||||
else:
|
||||
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
|
||||
pass
|
||||
|
||||
def test_extracts_with_mercury_parser():
|
||||
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
|
||||
@@ -122,6 +127,7 @@ def test_extracts_with_mercury_parser():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -184,6 +190,7 @@ def test_fails_gracefully_without_html():
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
|
||||
925
archivebox/plugins/package-lock.json
generated
925
archivebox/plugins/package-lock.json
generated
@@ -1,925 +0,0 @@
|
||||
{
|
||||
"name": "archivebox-plugins",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "archivebox-plugins",
|
||||
"dependencies": {
|
||||
"puppeteer-core": "^24.34.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@puppeteer/browsers": {
|
||||
"version": "2.11.0",
|
||||
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
|
||||
"integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"debug": "^4.4.3",
|
||||
"extract-zip": "^2.0.1",
|
||||
"progress": "^2.0.3",
|
||||
"proxy-agent": "^6.5.0",
|
||||
"semver": "^7.7.3",
|
||||
"tar-fs": "^3.1.1",
|
||||
"yargs": "^17.7.2"
|
||||
},
|
||||
"bin": {
|
||||
"browsers": "lib/cjs/main-cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@tootallnate/quickjs-emscripten": {
|
||||
"version": "0.23.0",
|
||||
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
||||
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/node": {
|
||||
"version": "25.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
|
||||
"integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~7.16.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/yauzl": {
|
||||
"version": "2.10.3",
|
||||
"resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
|
||||
"integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/agent-base": {
|
||||
"version": "7.1.4",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
|
||||
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/ansi-regex": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
|
||||
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/ansi-styles": {
|
||||
"version": "4.3.0",
|
||||
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
|
||||
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"color-convert": "^2.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/ast-types": {
|
||||
"version": "0.13.4",
|
||||
"resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
|
||||
"integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tslib": "^2.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/b4a": {
|
||||
"version": "1.7.3",
|
||||
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
|
||||
"integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
|
||||
"license": "Apache-2.0",
|
||||
"peerDependencies": {
|
||||
"react-native-b4a": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"react-native-b4a": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-events": {
|
||||
"version": "2.8.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
|
||||
"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
|
||||
"license": "Apache-2.0",
|
||||
"peerDependencies": {
|
||||
"bare-abort-controller": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-abort-controller": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-fs": {
|
||||
"version": "4.5.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
|
||||
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-events": "^2.5.4",
|
||||
"bare-path": "^3.0.0",
|
||||
"bare-stream": "^2.6.4",
|
||||
"bare-url": "^2.2.2",
|
||||
"fast-fifo": "^1.3.2"
|
||||
},
|
||||
"engines": {
|
||||
"bare": ">=1.16.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bare-buffer": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-buffer": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-os": {
|
||||
"version": "3.6.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
|
||||
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"bare": ">=1.14.0"
|
||||
}
|
||||
},
|
||||
"node_modules/bare-path": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
|
||||
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-os": "^3.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/bare-stream": {
|
||||
"version": "2.7.0",
|
||||
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
|
||||
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"streamx": "^2.21.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bare-buffer": "*",
|
||||
"bare-events": "*"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bare-buffer": {
|
||||
"optional": true
|
||||
},
|
||||
"bare-events": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/bare-url": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
|
||||
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
|
||||
"license": "Apache-2.0",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"bare-path": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/basic-ftp": {
|
||||
"version": "5.0.5",
|
||||
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
|
||||
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/buffer-crc32": {
|
||||
"version": "0.2.13",
|
||||
"resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
|
||||
"integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/chromium-bidi": {
|
||||
"version": "12.0.1",
|
||||
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
|
||||
"integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"mitt": "^3.0.1",
|
||||
"zod": "^3.24.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"devtools-protocol": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/cliui": {
|
||||
"version": "8.0.1",
|
||||
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
|
||||
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"string-width": "^4.2.0",
|
||||
"strip-ansi": "^6.0.1",
|
||||
"wrap-ansi": "^7.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/color-convert": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
|
||||
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"color-name": "~1.1.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=7.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/color-name": {
|
||||
"version": "1.1.4",
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
|
||||
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/data-uri-to-buffer": {
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
||||
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ms": "^2.1.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"supports-color": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/degenerator": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
|
||||
"integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ast-types": "^0.13.4",
|
||||
"escodegen": "^2.1.0",
|
||||
"esprima": "^4.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/devtools-protocol": {
|
||||
"version": "0.0.1534754",
|
||||
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
|
||||
"integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
|
||||
"license": "BSD-3-Clause",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/emoji-regex": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
|
||||
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/end-of-stream": {
|
||||
"version": "1.4.5",
|
||||
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
|
||||
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/escalade": {
|
||||
"version": "3.2.0",
|
||||
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
|
||||
"integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/escodegen": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
|
||||
"integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"esprima": "^4.0.1",
|
||||
"estraverse": "^5.2.0",
|
||||
"esutils": "^2.0.2"
|
||||
},
|
||||
"bin": {
|
||||
"escodegen": "bin/escodegen.js",
|
||||
"esgenerate": "bin/esgenerate.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"source-map": "~0.6.1"
|
||||
}
|
||||
},
|
||||
"node_modules/esprima": {
|
||||
"version": "4.0.1",
|
||||
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
|
||||
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
|
||||
"license": "BSD-2-Clause",
|
||||
"bin": {
|
||||
"esparse": "bin/esparse.js",
|
||||
"esvalidate": "bin/esvalidate.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=4"
|
||||
}
|
||||
},
|
||||
"node_modules/estraverse": {
|
||||
"version": "5.3.0",
|
||||
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
|
||||
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/esutils": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
|
||||
"integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/events-universal": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
|
||||
"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"bare-events": "^2.7.0"
|
||||
}
|
||||
},
|
||||
"node_modules/extract-zip": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
|
||||
"integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
|
||||
"license": "BSD-2-Clause",
|
||||
"dependencies": {
|
||||
"debug": "^4.1.1",
|
||||
"get-stream": "^5.1.0",
|
||||
"yauzl": "^2.10.0"
|
||||
},
|
||||
"bin": {
|
||||
"extract-zip": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10.17.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@types/yauzl": "^2.9.1"
|
||||
}
|
||||
},
|
||||
"node_modules/fast-fifo": {
|
||||
"version": "1.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
|
||||
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fd-slicer": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
|
||||
"integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pend": "~1.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/get-caller-file": {
|
||||
"version": "2.0.5",
|
||||
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
|
||||
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": "6.* || 8.* || >= 10.*"
|
||||
}
|
||||
},
|
||||
"node_modules/get-stream": {
|
||||
"version": "5.2.0",
|
||||
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
|
||||
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pump": "^3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/get-uri": {
|
||||
"version": "6.0.5",
|
||||
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
|
||||
"integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"basic-ftp": "^5.0.2",
|
||||
"data-uri-to-buffer": "^6.0.2",
|
||||
"debug": "^4.3.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/http-proxy-agent": {
|
||||
"version": "7.0.2",
|
||||
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
|
||||
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.0",
|
||||
"debug": "^4.3.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "7.0.6",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
|
||||
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/ip-address": {
|
||||
"version": "10.1.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
|
||||
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
}
|
||||
},
|
||||
"node_modules/is-fullwidth-code-point": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
|
||||
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/lru-cache": {
|
||||
"version": "7.18.3",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
|
||||
"integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/mitt": {
|
||||
"version": "3.0.1",
|
||||
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
|
||||
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ms": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/netmask": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
|
||||
"integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/once": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
|
||||
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"wrappy": "1"
|
||||
}
|
||||
},
|
||||
"node_modules/pac-proxy-agent": {
|
||||
"version": "7.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
|
||||
"integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@tootallnate/quickjs-emscripten": "^0.23.0",
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"get-uri": "^6.0.1",
|
||||
"http-proxy-agent": "^7.0.0",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"pac-resolver": "^7.0.1",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/pac-resolver": {
|
||||
"version": "7.0.1",
|
||||
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
|
||||
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"degenerator": "^5.0.0",
|
||||
"netmask": "^2.0.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/pend": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/progress": {
|
||||
"version": "2.0.3",
|
||||
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
|
||||
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/proxy-agent": {
|
||||
"version": "6.5.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
|
||||
"integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"http-proxy-agent": "^7.0.1",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"lru-cache": "^7.14.1",
|
||||
"pac-proxy-agent": "^7.1.0",
|
||||
"proxy-from-env": "^1.1.0",
|
||||
"socks-proxy-agent": "^8.0.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/proxy-from-env": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
|
||||
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/pump": {
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
|
||||
"integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"end-of-stream": "^1.1.0",
|
||||
"once": "^1.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/puppeteer-core": {
|
||||
"version": "24.34.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
|
||||
"integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@puppeteer/browsers": "2.11.0",
|
||||
"chromium-bidi": "12.0.1",
|
||||
"debug": "^4.4.3",
|
||||
"devtools-protocol": "0.0.1534754",
|
||||
"typed-query-selector": "^2.12.0",
|
||||
"webdriver-bidi-protocol": "0.3.10",
|
||||
"ws": "^8.18.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/require-directory": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
|
||||
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
"version": "7.7.3",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
|
||||
"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
|
||||
"license": "ISC",
|
||||
"bin": {
|
||||
"semver": "bin/semver.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/smart-buffer": {
|
||||
"version": "4.2.0",
|
||||
"resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
|
||||
"integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 6.0.0",
|
||||
"npm": ">= 3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/socks": {
|
||||
"version": "2.8.7",
|
||||
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
|
||||
"integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ip-address": "^10.0.1",
|
||||
"smart-buffer": "^4.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10.0.0",
|
||||
"npm": ">= 3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/socks-proxy-agent": {
|
||||
"version": "8.0.5",
|
||||
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
|
||||
"integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "^4.3.4",
|
||||
"socks": "^2.8.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/source-map": {
|
||||
"version": "0.6.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
|
||||
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
|
||||
"license": "BSD-3-Clause",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/streamx": {
|
||||
"version": "2.23.0",
|
||||
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
|
||||
"integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"events-universal": "^1.0.0",
|
||||
"fast-fifo": "^1.3.2",
|
||||
"text-decoder": "^1.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/string-width": {
|
||||
"version": "4.2.3",
|
||||
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
|
||||
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"emoji-regex": "^8.0.0",
|
||||
"is-fullwidth-code-point": "^3.0.0",
|
||||
"strip-ansi": "^6.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/strip-ansi": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
|
||||
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ansi-regex": "^5.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/tar-fs": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
|
||||
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"pump": "^3.0.0",
|
||||
"tar-stream": "^3.1.5"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"bare-fs": "^4.0.1",
|
||||
"bare-path": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/tar-stream": {
|
||||
"version": "3.1.7",
|
||||
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
|
||||
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"b4a": "^1.6.4",
|
||||
"fast-fifo": "^1.2.0",
|
||||
"streamx": "^2.15.0"
|
||||
}
|
||||
},
|
||||
"node_modules/text-decoder": {
|
||||
"version": "1.2.3",
|
||||
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
|
||||
"integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"b4a": "^1.6.4"
|
||||
}
|
||||
},
|
||||
"node_modules/tslib": {
|
||||
"version": "2.8.1",
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
|
||||
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
|
||||
"license": "0BSD"
|
||||
},
|
||||
"node_modules/typed-query-selector": {
|
||||
"version": "2.12.0",
|
||||
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
|
||||
"integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/undici-types": {
|
||||
"version": "7.16.0",
|
||||
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
|
||||
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
|
||||
"license": "MIT",
|
||||
"optional": true
|
||||
},
|
||||
"node_modules/webdriver-bidi-protocol": {
|
||||
"version": "0.3.10",
|
||||
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
|
||||
"integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/wrap-ansi": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
|
||||
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ansi-styles": "^4.0.0",
|
||||
"string-width": "^4.1.0",
|
||||
"strip-ansi": "^6.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/wrappy": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
|
||||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.18.3",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
|
||||
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/y18n": {
|
||||
"version": "5.0.8",
|
||||
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
||||
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
}
|
||||
},
|
||||
"node_modules/yargs": {
|
||||
"version": "17.7.2",
|
||||
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
|
||||
"integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"cliui": "^8.0.1",
|
||||
"escalade": "^3.1.1",
|
||||
"get-caller-file": "^2.0.5",
|
||||
"require-directory": "^2.1.1",
|
||||
"string-width": "^4.2.3",
|
||||
"y18n": "^5.0.5",
|
||||
"yargs-parser": "^21.1.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/yargs-parser": {
|
||||
"version": "21.1.1",
|
||||
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
|
||||
"integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/yauzl": {
|
||||
"version": "2.10.0",
|
||||
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
|
||||
"integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"buffer-crc32": "~0.2.3",
|
||||
"fd-slicer": "~1.1.0"
|
||||
}
|
||||
},
|
||||
"node_modules/zod": {
|
||||
"version": "3.25.76",
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
|
||||
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}
|
||||
@@ -3,9 +3,10 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_PAPERSDL": {
|
||||
"PAPERSDL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
|
||||
"description": "Enable paper downloading with papers-dl"
|
||||
},
|
||||
"PAPERSDL_BINARY": {
|
||||
|
||||
@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
if normalized != url:
|
||||
urls_found.add(unescape(normalized))
|
||||
|
||||
if not urls_found:
|
||||
click.echo('No URLs found', err=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Emit Snapshot records to stdout (JSONL)
|
||||
for found_url in sorted(urls_found):
|
||||
record = {
|
||||
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
|
||||
print(json.dumps(record))
|
||||
|
||||
click.echo(f'Found {len(urls_found)} URLs', err=True)
|
||||
# Emit ArchiveResult record to mark completion
|
||||
status = 'succeeded' if urls_found else 'skipped'
|
||||
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
|
||||
ar_record = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output_str,
|
||||
}
|
||||
print(json.dumps(ar_record))
|
||||
|
||||
click.echo(output_str, err=True)
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
@@ -27,12 +27,13 @@ class TestParseHtmlUrls:
|
||||
|
||||
assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
|
||||
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
assert output_file.exists(), "Output file not created"
|
||||
# Verify stdout contains JSONL records for discovered URLs
|
||||
# example.com links to iana.org
|
||||
assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
|
||||
|
||||
# Verify output contains IANA link (example.com links to iana.org)
|
||||
content = output_file.read_text()
|
||||
assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
|
||||
# Verify ArchiveResult record is present
|
||||
assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
|
||||
assert '"status": "succeeded"' in result.stdout, "Missing success status"
|
||||
|
||||
def test_extracts_href_urls(self, tmp_path):
|
||||
"""Test extracting URLs from anchor tags."""
|
||||
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 3 URLs' in result.stdout
|
||||
assert 'Found 3 URLs' in result.stderr
|
||||
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
assert output_file.exists()
|
||||
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
assert len(lines) == 3
|
||||
# Parse Snapshot records from stdout
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
|
||||
|
||||
urls = set()
|
||||
for line in lines:
|
||||
entry = json.loads(line)
|
||||
assert entry['type'] == 'Snapshot'
|
||||
assert 'url' in entry
|
||||
urls.add(entry['url'])
|
||||
|
||||
@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
|
||||
assert 'https://foo.bar/page' in urls
|
||||
assert 'http://test.org' in urls
|
||||
|
||||
# Verify ArchiveResult record
|
||||
assert '"type": "ArchiveResult"' in result.stdout
|
||||
assert '"status": "succeeded"' in result.stdout
|
||||
|
||||
def test_ignores_non_http_schemes(self, tmp_path):
|
||||
"""Test that non-http schemes are ignored."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
assert len(lines) == 1
|
||||
|
||||
# Parse Snapshot records from stdout
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
|
||||
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://valid.com'
|
||||
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://example.com/page?a=1&b=2'
|
||||
|
||||
def test_deduplicates_urls(self, tmp_path):
|
||||
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 1
|
||||
|
||||
def test_excludes_source_url(self, tmp_path):
|
||||
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 1
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://other.com'
|
||||
|
||||
def test_exits_1_when_no_urls_found(self, tmp_path):
|
||||
"""Test that script exits with code 1 when no URLs found."""
|
||||
def test_skips_when_no_urls_found(self, tmp_path):
|
||||
"""Test that script returns skipped status when no URLs found."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
input_file.write_text('<html><body>No links here</body></html>')
|
||||
|
||||
@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 1
|
||||
assert result.returncode == 0
|
||||
assert 'No URLs found' in result.stderr
|
||||
assert '"status": "skipped"' in result.stdout
|
||||
|
||||
def test_handles_malformed_html(self, tmp_path):
|
||||
"""Test handling of malformed HTML."""
|
||||
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
assert len(lines) == 2
|
||||
|
||||
def test_output_is_valid_json(self, tmp_path):
|
||||
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://example.com'
|
||||
assert 'type' in entry
|
||||
assert 'plugin' in entry
|
||||
assert entry['type'] == 'Snapshot'
|
||||
assert entry['plugin'] == 'parse_html_urls'
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user