diff --git a/.github/workflows/release-runner.yml b/.github/workflows/release-runner.yml
new file mode 100644
index 00000000..e9dd3ac4
--- /dev/null
+++ b/.github/workflows/release-runner.yml
@@ -0,0 +1,45 @@
+name: Release State
+
+on:
+ push:
+ branches:
+ - '**'
+ workflow_dispatch:
+
+permissions:
+ contents: write
+ id-token: write
+
+jobs:
+ release-state:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ submodules: true
+ ref: ${{ github.ref_name }}
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.13"
+
+ - uses: astral-sh/setup-uv@v6
+ with:
+ enable-cache: true
+
+ - uses: actions/setup-node@v4
+ with:
+ node-version: 22
+
+ - name: Configure git identity
+ run: |
+ git config user.name "github-actions[bot]"
+ git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+ - name: Run release script
+ env:
+ DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+ GH_TOKEN: ${{ github.token }}
+ PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }}
+ run: ./bin/release.sh
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 86a507fa..032127ae 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -9,7 +9,6 @@ name: Release
# This workflow ensures the correct ordering during a release.
on:
- workflow_dispatch:
release:
types: [published]
diff --git a/archivebox/api/urls.py b/archivebox/api/urls.py
index 81f8cb43..d22e07f7 100644
--- a/archivebox/api/urls.py
+++ b/archivebox/api/urls.py
@@ -6,8 +6,9 @@ from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
- path("", RedirectView.as_view(url='/api/v1')),
+ path("", RedirectView.as_view(url='/api/v1/docs')),
+ path("v1/", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),
diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py
index 062eba8b..51dab0e9 100644
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -6,7 +6,8 @@ from typing import List, Optional, Union, Any, Annotated
from datetime import datetime
from django.db.models import Model, Q
-from django.http import HttpRequest
+from django.conf import settings
+from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
@@ -18,6 +19,22 @@ from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from archivebox.core.models import Snapshot, ArchiveResult, Tag
+from archivebox.api.auth import auth_using_token
+from archivebox.config.common import SERVER_CONFIG
+from archivebox.core.tag_utils import (
+ build_tag_cards,
+ delete_tag as delete_tag_record,
+ export_tag_snapshots_jsonl,
+ export_tag_urls,
+ get_matching_tags,
+ get_or_create_tag,
+ get_tag_by_ref,
+ normalize_created_by_filter,
+ normalize_created_year_filter,
+ normalize_has_snapshots_filter,
+ normalize_tag_sort,
+ rename_tag as rename_tag_record,
+)
from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
@@ -404,7 +421,7 @@ class TagSchema(Schema):
def get_tags(request: HttpRequest):
setattr(request, 'with_snapshots', False)
setattr(request, 'with_archiveresults', False)
- return Tag.objects.all().distinct()
+ return get_matching_tags()
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
@@ -412,9 +429,9 @@ def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
setattr(request, 'with_snapshots', with_snapshots)
setattr(request, 'with_archiveresults', False)
try:
- return Tag.objects.get(id__icontains=tag_id)
+ return get_tag_by_ref(tag_id)
except (Tag.DoesNotExist, ValidationError):
- return Tag.objects.get(slug__icontains=tag_id)
+ raise HttpError(404, 'Tag not found')
@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
@@ -459,6 +476,55 @@ class TagCreateResponseSchema(Schema):
created: bool
+class TagSearchSnapshotSchema(Schema):
+ id: str
+ title: str
+ url: str
+ favicon_url: str
+ admin_url: str
+ archive_url: str
+ downloaded_at: Optional[str] = None
+
+
+class TagSearchCardSchema(Schema):
+ id: int
+ name: str
+ slug: str
+ num_snapshots: int
+ filter_url: str
+ edit_url: str
+ export_urls_url: str
+ export_jsonl_url: str
+ rename_url: str
+ delete_url: str
+ snapshots: List[TagSearchSnapshotSchema]
+
+
+class TagSearchResponseSchema(Schema):
+ tags: List[TagSearchCardSchema]
+ sort: str
+ created_by: str
+ year: str
+ has_snapshots: str
+
+
+class TagUpdateSchema(Schema):
+ name: str
+
+
+class TagUpdateResponseSchema(Schema):
+ success: bool
+ tag_id: int
+ tag_name: str
+ slug: str
+
+
+class TagDeleteResponseSchema(Schema):
+ success: bool
+ tag_id: int
+ deleted_count: int
+
+
class TagSnapshotRequestSchema(Schema):
snapshot_id: str
tag_name: Optional[str] = None
@@ -471,41 +537,82 @@ class TagSnapshotResponseSchema(Schema):
tag_name: str
-@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete")
+@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags")
+def search_tags(
+ request: HttpRequest,
+ q: str = "",
+ sort: str = 'created_desc',
+ created_by: str = '',
+ year: str = '',
+ has_snapshots: str = 'all',
+):
+ """Return detailed tag cards for admin/live-search UIs."""
+ normalized_sort = normalize_tag_sort(sort)
+ normalized_created_by = normalize_created_by_filter(created_by)
+ normalized_year = normalize_created_year_filter(year)
+ normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
+ return {
+ 'tags': build_tag_cards(
+ query=q,
+ request=request,
+ sort=normalized_sort,
+ created_by=normalized_created_by,
+ year=normalized_year,
+ has_snapshots=normalized_has_snapshots,
+ ),
+ 'sort': normalized_sort,
+ 'created_by': normalized_created_by,
+ 'year': normalized_year,
+ 'has_snapshots': normalized_has_snapshots,
+ }
+
+
+def _public_tag_listing_enabled() -> bool:
+ explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
+ if explicit is not None:
+ return bool(explicit)
+ return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
+
+
+def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
+ user = getattr(request, 'user', None)
+ if getattr(user, 'is_authenticated', False):
+ return True
+
+ token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
+ auth_header = request.headers.get('Authorization', '')
+ if not token and auth_header.lower().startswith('bearer '):
+ token = auth_header[7:].strip()
+
+ if token and auth_using_token(token=token, request=request):
+ return True
+
+ return _public_tag_listing_enabled()
+
+
+@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None)
def tags_autocomplete(request: HttpRequest, q: str = ""):
"""Return tags matching the query for autocomplete."""
- if not q:
- # Return all tags if no query (limited to 50)
- tags = Tag.objects.all().order_by('name')[:50]
- else:
- tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20]
+ if not _request_has_tag_autocomplete_access(request):
+ raise HttpError(401, 'Authentication required')
+
+ tags = get_matching_tags(q)[:50 if not q else 20]
return {
- 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags]
+ 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
}
@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create")
def tags_create(request: HttpRequest, data: TagCreateSchema):
"""Create a new tag or return existing one."""
- name = data.name.strip()
- if not name:
- raise HttpError(400, 'Tag name is required')
-
- tag, created = Tag.objects.get_or_create(
- name__iexact=name,
- defaults={
- 'name': name,
- 'created_by': request.user if request.user.is_authenticated else None,
- }
- )
-
- # If found by case-insensitive match, use that tag
- if not created:
- existing_tag = Tag.objects.filter(name__iexact=name).first()
- if existing_tag is None:
- raise HttpError(500, 'Failed to load existing tag after get_or_create')
- tag = existing_tag
+ try:
+ tag, created = get_or_create_tag(
+ data.name,
+ created_by=request.user if request.user.is_authenticated else None,
+ )
+ except ValueError as err:
+ raise HttpError(400, str(err)) from err
return {
'success': True,
@@ -515,6 +622,62 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
}
+@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag")
+def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
+ try:
+ tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
+ except Tag.DoesNotExist as err:
+ raise HttpError(404, 'Tag not found') from err
+ except ValueError as err:
+ raise HttpError(400, str(err)) from err
+
+ return {
+ 'success': True,
+ 'tag_id': tag.pk,
+ 'tag_name': tag.name,
+ 'slug': tag.slug,
+ }
+
+
+@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag")
+def delete_tag(request: HttpRequest, tag_id: int):
+ try:
+ tag = get_tag_by_ref(tag_id)
+ except Tag.DoesNotExist as err:
+ raise HttpError(404, 'Tag not found') from err
+
+ deleted_count, _ = delete_tag_record(tag)
+ return {
+ 'success': True,
+ 'tag_id': int(tag_id),
+ 'deleted_count': deleted_count,
+ }
+
+
+@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export")
+def tag_urls_export(request: HttpRequest, tag_id: int):
+ try:
+ tag = get_tag_by_ref(tag_id)
+ except Tag.DoesNotExist as err:
+ raise HttpError(404, 'Tag not found') from err
+
+ response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
+ response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
+ return response
+
+
+@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export")
+def tag_snapshots_export(request: HttpRequest, tag_id: int):
+ try:
+ tag = get_tag_by_ref(tag_id)
+ except Tag.DoesNotExist as err:
+ raise HttpError(404, 'Tag not found') from err
+
+ response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
+ response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
+ return response
+
+
@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot")
def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
"""Add a tag to a snapshot. Creates the tag if it doesn't exist."""
@@ -534,24 +697,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
# Get or create the tag
if data.tag_name:
- name = data.tag_name.strip()
- if not name:
- raise HttpError(400, 'Tag name is required')
-
- tag, _ = Tag.objects.get_or_create(
- name__iexact=name,
- defaults={
- 'name': name,
- 'created_by': request.user if request.user.is_authenticated else None,
- }
- )
- # If found by case-insensitive match, use that tag
- existing_tag = Tag.objects.filter(name__iexact=name).first()
- if existing_tag is not None:
- tag = existing_tag
+ try:
+ tag, _ = get_or_create_tag(
+ data.tag_name,
+ created_by=request.user if request.user.is_authenticated else None,
+ )
+ except ValueError as err:
+ raise HttpError(400, str(err)) from err
elif data.tag_id:
try:
- tag = Tag.objects.get(pk=data.tag_id)
+ tag = get_tag_by_ref(data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
else:
diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py
index 0cd64854..116e3654 100644
--- a/archivebox/base_models/admin.py
+++ b/archivebox/base_models/admin.py
@@ -4,7 +4,7 @@ __package__ = 'archivebox.base_models'
import json
from collections.abc import Mapping
-from typing import TypedDict
+from typing import NotRequired, TypedDict
from django import forms
from django.contrib import admin
@@ -17,9 +17,13 @@ from django_object_actions import DjangoObjectActions
class ConfigOption(TypedDict):
plugin: str
- type: str
+ type: str | list[str]
default: object
description: str
+ enum: NotRequired[list[object]]
+ pattern: NotRequired[str]
+ minimum: NotRequired[int | float]
+ maximum: NotRequired[int | float]
class KeyValueWidget(forms.Widget):
@@ -44,12 +48,16 @@ class KeyValueWidget(forms.Widget):
options: dict[str, ConfigOption] = {}
for plugin_name, schema in plugin_configs.items():
for key, prop in schema.get('properties', {}).items():
- options[key] = {
+ option: ConfigOption = {
'plugin': plugin_name,
'type': prop.get('type', 'string'),
'default': prop.get('default', ''),
'description': prop.get('description', ''),
}
+ for schema_key in ('enum', 'pattern', 'minimum', 'maximum'):
+ if schema_key in prop:
+ option[schema_key] = prop[schema_key]
+ options[key] = option
return options
except Exception:
return {}
@@ -98,14 +106,12 @@ class KeyValueWidget(forms.Widget):
'''
# Render existing key-value pairs
- row_idx = 0
for key, val in data.items():
val_str = json.dumps(val) if not isinstance(val, str) else val
- html += self._render_row(widget_id, row_idx, key, val_str)
- row_idx += 1
+ html += self._render_row(widget_id, key, val_str)
# Always add one empty row for new entries
- html += self._render_row(widget_id, row_idx, '', '')
+ html += self._render_row(widget_id, '', '')
html += f'''
@@ -114,22 +120,450 @@ class KeyValueWidget(forms.Widget):
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
+ Add Row
-
'''
return mark_safe(html)
- def _render_row(self, widget_id: str, idx: int, key: str, value: str) -> str:
+ def _render_row(self, widget_id: str, key: str, value: str) -> str:
return f'''
-
-
-
-
−
+
'''
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index cbb6c7de..e38f4155 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
+ url_allowlist: str='',
+ url_denylist: str='',
parser: str="auto",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
- update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+ update: bool | None=None,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
@@ -85,6 +87,8 @@ def add(urls: str | list[str],
created_by_id = created_by_id or get_or_create_system_user_pk()
started_at = timezone.now()
+ if update is None:
+ update = not ARCHIVING_CONFIG.ONLY_NEW
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
'PLUGINS': plugins,
'DEFAULT_PERSONA': persona_name,
'PARSER': parser,
+ **({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
+ **({'URL_DENYLIST': url_denylist} if url_denylist else {}),
}
)
@@ -150,6 +156,9 @@ def add(urls: str | list[str],
snapshot.ensure_crawl_symlink()
return crawl, crawl.snapshot_set.all()
+ if bg:
+ crawl.create_snapshots_from_urls()
+
# 5. Start the crawl runner to process the queue
# The runner will:
# - Process Crawl -> create Snapshots from all URLs
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
except Exception:
rel_output_str = str(crawl.output_dir)
- # Build admin URL from SERVER_CONFIG
- bind_addr = SERVER_CONFIG.BIND_ADDR
+ bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
base_url = bind_addr
else:
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
+@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
-@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py
index aea83413..6cf0dffc 100644
--- a/archivebox/cli/archivebox_archiveresult.py
+++ b/archivebox/cli/archivebox_archiveresult.py
@@ -42,6 +42,16 @@ from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
+def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:
+ return {
+ 'type': 'ArchiveResult',
+ 'snapshot_id': str(snapshot_id),
+ 'plugin': plugin,
+ 'hook_name': hook_name,
+ 'status': status,
+ }
+
+
# =============================================================================
# CREATE
# =============================================================================
@@ -52,21 +62,21 @@ def create_archiveresults(
status: str = 'queued',
) -> int:
"""
- Create ArchiveResults for Snapshots.
+ Create ArchiveResult request records for Snapshots.
- Reads Snapshot records from stdin and creates ArchiveResult entries.
+ Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
- If --plugin is specified, only creates results for that plugin.
- Otherwise, creates results for all pending plugins.
+ If --plugin is specified, only emits requests for that plugin.
+ Otherwise, emits requests for all enabled snapshot hooks.
Exit codes:
0: Success
1: Failure
"""
- from django.utils import timezone
-
+ from archivebox.config.configset import get_config
+ from archivebox.hooks import discover_hooks
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
- from archivebox.core.models import Snapshot, ArchiveResult
+ from archivebox.core.models import Snapshot
is_tty = sys.stdout.isatty()
@@ -135,33 +145,20 @@ def create_archiveresults(
created_count = 0
for snapshot in snapshots:
if plugin:
- # Create for specific plugin only
- result, created = ArchiveResult.objects.get_or_create(
- snapshot=snapshot,
- plugin=plugin,
- defaults={
- 'status': status,
- 'retry_at': timezone.now(),
- }
- )
- if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
- # Reset for retry
- result.status = status
- result.retry_at = timezone.now()
- result.save()
-
if not is_tty:
- write_record(result.to_json())
+ write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
created_count += 1
else:
- # Create all pending plugins
- snapshot.create_pending_archiveresults()
- for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
+ config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
+ hooks = discover_hooks('Snapshot', config=config)
+ for hook_path in hooks:
+ hook_name = hook_path.name
+ plugin_name = hook_path.parent.name
if not is_tty:
- write_record(result.to_json())
+ write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
created_count += 1
- rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
+ rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
return 0
@@ -205,6 +202,7 @@ def list_archiveresults(
'succeeded': 'green',
'failed': 'red',
'skipped': 'dim',
+ 'noresults': 'dim',
'backoff': 'magenta',
}.get(result.status, 'dim')
rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
@@ -233,8 +231,6 @@ def update_archiveresults(
0: Success
1: No input or error
"""
- from django.utils import timezone
-
from archivebox.misc.jsonl import read_stdin, write_record
from archivebox.core.models import ArchiveResult
@@ -257,7 +253,6 @@ def update_archiveresults(
# Apply updates from CLI flags
if status:
result.status = status
- result.retry_at = timezone.now()
result.save()
updated_count += 1
diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py
index dde97edb..8f132a58 100644
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -38,15 +38,16 @@ import rich_click as click
def process_archiveresult_by_id(archiveresult_id: str) -> int:
"""
- Run extraction for a single ArchiveResult by ID (used by workers).
+ Re-run extraction for a single ArchiveResult by ID.
- Triggers the ArchiveResult's state machine tick() to run the extractor
- plugin, but only after claiming ownership via retry_at. This keeps direct
- CLI execution aligned with the worker lifecycle and prevents duplicate hook
- runs if another process already owns the same ArchiveResult.
+ ArchiveResults are projected status rows, not queued work items. Re-running
+ a single result means resetting that row and queueing its parent snapshot
+ through the shared crawl runner with the corresponding plugin selected.
"""
from rich import print as rprint
+ from django.utils import timezone
from archivebox.core.models import ArchiveResult
+ from archivebox.services.runner import run_crawl
try:
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
try:
- # Claim-before-tick is the required calling pattern for direct
- # state-machine drivers. If another worker already owns this row,
- # report that and exit without running duplicate extractor side effects.
- if not archiveresult.tick_claimed(lock_seconds=120):
- print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
- return 0
+ archiveresult.reset_for_retry()
+ snapshot = archiveresult.snapshot
+ snapshot.status = snapshot.StatusChoices.QUEUED
+ snapshot.retry_at = timezone.now()
+ snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
+
+ crawl = snapshot.crawl
+ if crawl.status != crawl.StatusChoices.STARTED:
+ crawl.status = crawl.StatusChoices.QUEUED
+ crawl.retry_at = timezone.now()
+ crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
+
+ run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
+ archiveresult.refresh_from_db()
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
return 0
+ elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
+ print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
+ return 0
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
return 1
@@ -121,8 +133,9 @@ def run_plugins(
rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1
- # Gather snapshot IDs to process
+ # Gather snapshot IDs and optional plugin constraints to process
snapshot_ids = set()
+ requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
for record in records:
record_type = record.get('type')
@@ -142,6 +155,9 @@ def run_plugins(
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
+ plugin_name = record.get('plugin')
+ if plugin_name and not plugins_list:
+ requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
elif 'id' in record:
# Assume it's a snapshot ID
@@ -160,26 +176,15 @@ def run_plugins(
rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
continue
- # Create pending ArchiveResults if needed
- if plugins_list:
- # Only create for specific plugins
- for plugin_name in plugins_list:
- result, created = ArchiveResult.objects.get_or_create(
- snapshot=snapshot,
- plugin=plugin_name,
- defaults={
- 'status': ArchiveResult.StatusChoices.QUEUED,
- 'retry_at': timezone.now(),
- }
- )
- if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
- # Reset for retry
- result.status = ArchiveResult.StatusChoices.QUEUED
- result.retry_at = timezone.now()
- result.save()
- else:
- # Create all pending plugins
- snapshot.create_pending_archiveresults()
+ for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
+ existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
+ if existing_result and existing_result.status in [
+ ArchiveResult.StatusChoices.FAILED,
+ ArchiveResult.StatusChoices.SKIPPED,
+ ArchiveResult.StatusChoices.NORESULTS,
+ ArchiveResult.StatusChoices.BACKOFF,
+ ]:
+ existing_result.reset_for_retry()
# Reset snapshot status to allow processing
if snapshot.status == Snapshot.StatusChoices.SEALED:
@@ -207,10 +212,15 @@ def run_plugins(
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
+ selected_plugins = plugins_list or sorted({
+ plugin
+ for snapshot_id in crawl_snapshot_ids
+ for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
+ }) or None
run_crawl(
crawl_id,
snapshot_ids=sorted(crawl_snapshot_ids),
- selected_plugins=plugins_list or None,
+ selected_plugins=selected_plugins,
)
# Output results as JSONL (when piped) or human-readable (when TTY)
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index eb603b77..6714c537 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
+@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
+@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
+@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
- tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
- """List Snapshots as JSONL."""
+ tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
+ sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
+ """List Snapshots."""
sys.exit(list_snapshots(
status=status,
url__icontains=url__icontains,
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
tag=tag,
crawl_id=crawl_id,
limit=limit,
+ sort=sort,
+ csv=csv,
+ with_headers=with_headers,
))
diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py
index c8acbbff..6ba981f0 100644
--- a/archivebox/cli/archivebox_persona.py
+++ b/archivebox/cli/archivebox_persona.py
@@ -42,6 +42,7 @@ import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
+from archivebox.personas import importers as persona_importers
# =============================================================================
@@ -440,8 +441,6 @@ def create_personas(
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
- else:
- browser_binary = None
created_count = 0
for name in name_list:
@@ -450,7 +449,7 @@ def create_personas(
continue
# Validate persona name to prevent path traversal
- is_valid, error_msg = validate_persona_name(name)
+ is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue
@@ -468,49 +467,29 @@ def create_personas(
# Import browser profile if requested
if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
- persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
-
- # Copy the browser profile
- rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
-
try:
- # Remove existing chrome_user_data if it exists
- if persona_chrome_dir.exists():
- shutil.rmtree(persona_chrome_dir)
-
- # Copy the profile directory
- # We copy the entire user data dir, not just Default profile
- shutil.copytree(
- source_profile_dir,
- persona_chrome_dir,
- symlinks=True,
- ignore=shutil.ignore_patterns(
- 'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
- 'Service Worker', 'GCM Store', '*.log', 'Crashpad',
- 'BrowserMetrics', 'BrowserMetrics-spare.pma',
- 'SingletonLock', 'SingletonSocket', 'SingletonCookie',
- ),
+ import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
+ import_result = persona_importers.import_persona_from_source(
+ persona,
+ import_source,
+ copy_profile=True,
+ import_cookies=True,
+ capture_storage=False,
)
- rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
-
- # Extract cookies via CDP
- rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
-
- if extract_cookies_via_cdp(
- persona_chrome_dir,
- cookies_file,
- profile_dir=profile,
- chrome_binary=browser_binary,
- ):
- rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
- else:
- rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
- rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
-
except Exception as e:
- rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
+ rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
return 1
+ if import_result.profile_copied:
+ rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
+ if import_result.cookies_imported:
+ rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
+ elif not import_result.profile_copied:
+ rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
+
+ for warning in import_result.warnings:
+ rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
+
if not is_tty:
write_record({
'id': str(persona.id) if hasattr(persona, 'id') else None,
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
# Apply updates from CLI flags
if name:
# Validate new name to prevent path traversal
- is_valid, error_msg = validate_persona_name(name)
+ is_valid, error_msg = persona_importers.validate_persona_name(name)
if not is_valid:
rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
continue
diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py
index 21938572..41c0724a 100644
--- a/archivebox/cli/archivebox_pluginmap.py
+++ b/archivebox/cli/archivebox_pluginmap.py
@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
└─────────────────────────────────────────────────────────────────────────────┘
"""
-ARCHIVERESULT_MACHINE_DIAGRAM = """
-┌─────────────────────────────────────────────────────────────────────────────┐
-│ ArchiveResultMachine │
-├─────────────────────────────────────────────────────────────────────────────┤
-│ │
-│ ┌─────────────┐ │
-│ │ QUEUED │◄─────────────────┐ │
-│ │ (initial) │ │ │
-│ └──┬───────┬──┘ │ │
-│ │ │ │ tick() unless can_start() │
-│ │ │ exceeded_max_ │ │
-│ │ │ attempts │ │
-│ │ ▼ │ │
-│ │ ┌──────────┐ │ │
-│ │ │ SKIPPED │ │ │
-│ │ │ (final) │ │ │
-│ │ └──────────┘ │ │
-│ │ tick() when │ │
-│ │ can_start() │ │
-│ ▼ │ │
-│ ┌─────────────┐ │ │
-│ │ STARTED │──────────────────┘ │
-│ │ │◄─────────────────────────────────────────────────┐ │
-│ │ enter: │ │ │ │
-│ │ result.run()│ tick() unless │ │ │
-│ │ (execute │ is_finished() │ │ │
-│ │ hook via │──────────────────────┘ │ │
-│ │ run_hook())│ │ │
-│ └──────┬──────┘ │ │
-│ │ │ │
-│ │ tick() checks status set by hook output │ │
-│ ├─────────────┬─────────────┬─────────────┐ │ │
-│ ▼ ▼ ▼ ▼ │ │
-│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
-│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │
-│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │
-│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │
-│ │ │ │ │
-│ exceeded_max_ │ │ can_start()│ │
-│ attempts │ │ loops back │ │
-│ ▼ │ └────────────┘ │
-│ ┌──────────┐ │ │
-│ │ SKIPPED │◄─┘ │
-│ │ (final) │ │
-│ └──────────┘ │
-│ │
-│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
-└─────────────────────────────────────────────────────────────────────────────┘
-"""
-
BINARY_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ BinaryMachine │
@@ -193,8 +143,8 @@ def pluginmap(
"""
Show a map of all state machines and their associated plugin hooks.
- Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
- ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
+ Displays ASCII art diagrams of the core queued model state machines (Crawl,
+ Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
that will run for each model's transitions.
"""
from rich.console import Console
@@ -257,17 +207,6 @@ def pluginmap(
prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
prnt()
- # Show diagrams first (unless quiet mode)
- if not quiet:
- # Show ArchiveResult diagram separately since it's different
- prnt(Panel(
- ARCHIVERESULT_MACHINE_DIAGRAM,
- title='[bold green]ArchiveResultMachine[/bold green]',
- border_style='green',
- expand=False,
- ))
- prnt()
-
for event_name, info in model_events.items():
# Discover hooks for this event
hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py
index fd88823b..292baf87 100644
--- a/archivebox/cli/archivebox_run.py
+++ b/archivebox/cli/archivebox_run.py
@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
try:
archiveresult = ArchiveResult.objects.get(id=record_id)
except ArchiveResult.DoesNotExist:
- archiveresult = ArchiveResult.from_json(record)
+ archiveresult = None
else:
- # New archiveresult - create it
- archiveresult = ArchiveResult.from_json(record)
+ archiveresult = None
+ snapshot_id = record.get('snapshot_id')
+ plugin_name = record.get('plugin')
+ snapshot = None
if archiveresult:
- archiveresult.retry_at = timezone.now()
- if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
- archiveresult.status = ArchiveResult.StatusChoices.QUEUED
- archiveresult.save()
+ if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
+ archiveresult.reset_for_retry()
snapshot = archiveresult.snapshot
+ plugin_name = plugin_name or archiveresult.plugin
+ elif snapshot_id:
+ try:
+ snapshot = Snapshot.objects.get(id=snapshot_id)
+ except Snapshot.DoesNotExist:
+ snapshot = None
+
+ if snapshot:
snapshot.retry_at = timezone.now()
if snapshot.status != Snapshot.StatusChoices.STARTED:
snapshot.status = Snapshot.StatusChoices.QUEUED
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl_id = str(snapshot.crawl_id)
snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
- if archiveresult.plugin:
- plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
- output_records.append(archiveresult.to_json())
+ if plugin_name:
+ plugin_names_by_crawl[crawl_id].add(str(plugin_name))
+ output_records.append(record if not archiveresult else archiveresult.to_json())
queued_count += 1
elif record_type == TYPE_BINARY:
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
"""
from django.utils import timezone
from archivebox.machine.models import Machine, Process
- from archivebox.services.runner import run_pending_crawls
+ from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls
Process.cleanup_stale_running()
+ recover_orphaned_snapshots()
+ recover_orphaned_crawls()
Machine.current()
current = Process.current()
if current.process_type != Process.TypeChoices.ORCHESTRATOR:
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
traceback.print_exc()
sys.exit(1)
+ if daemon:
+ if not sys.stdin.isatty():
+ exit_code = process_stdin_records()
+ if exit_code != 0:
+ sys.exit(exit_code)
+ sys.exit(run_runner(daemon=True))
+
if not sys.stdin.isatty():
sys.exit(process_stdin_records())
else:
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index 36e53e91..cbd7a9ce 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -3,9 +3,7 @@
__package__ = 'archivebox.cli'
from typing import Iterable
-import os
import sys
-import subprocess
import rich_click as click
from rich import print
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
from archivebox.config.common import SERVER_CONFIG
+def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
+ """Stop any existing orchestrator process so the server can take ownership."""
+ process_model.cleanup_stale_running(machine=machine)
+
+ running_runners = list(process_model.objects.filter(
+ machine=machine,
+ status=process_model.StatusChoices.RUNNING,
+ process_type=process_model.TypeChoices.ORCHESTRATOR,
+ ).order_by('created_at'))
+
+ if not running_runners:
+ return 0
+
+ log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')
+
+ if supervisor is not None and stop_worker_fn is not None:
+ for worker_name in ('worker_runner', 'worker_runner_watch'):
+ try:
+ stop_worker_fn(supervisor, worker_name)
+ except Exception:
+ pass
+
+ for proc in running_runners:
+ try:
+ proc.kill_tree(graceful_timeout=2.0)
+ except Exception:
+ try:
+ proc.terminate(graceful_timeout=2.0)
+ except Exception:
+ pass
+
+ process_model.cleanup_stale_running(machine=machine)
+ return len(running_runners)
+
+
@enforce_types
def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
reload: bool=False,
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
if debug or reload:
SHELL_CONFIG.DEBUG = True
- if run_in_debug:
- os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
- if reload:
- os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
- from archivebox.config.common import STORAGE_CONFIG
- pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
- os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
-
- from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
- is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
- if not is_reloader_child:
- env = os.environ.copy()
- subprocess.Popen(
- [sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
- env=env,
- stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL,
- )
-
from django.contrib.auth.models import User
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
+ from archivebox.workers.supervisord_util import (
+ get_existing_supervisord_process,
+ get_worker,
+ stop_worker,
+ start_server_workers,
+ is_port_in_use,
+ )
+ from archivebox.machine.models import Machine, Process
+
+ # Check if port is already in use
+ if is_port_in_use(host, int(port)):
+ print(f'[red][X] Error: Port {port} is already in use[/red]')
+ print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}')
+ print(' Stop the conflicting process or choose a different port')
+ sys.exit(1)
+
+ machine = Machine.current()
+ stop_existing_background_runner(
+ machine=machine,
+ process_model=Process,
+ supervisor=get_existing_supervisord_process(),
+ stop_worker_fn=stop_worker,
+ )
+
+ supervisor = get_existing_supervisord_process()
+ if supervisor:
+ server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
+ server_proc = get_worker(supervisor, server_worker_name)
+ server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
+ if server_state == 'RUNNING':
+ runner_proc = get_worker(supervisor, 'worker_runner')
+ runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
+ runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
+ runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
+ print('[red][X] Error: ArchiveBox server is already running[/red]')
+ print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+ if runner_state == 'RUNNING':
+ print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
+ if runner_watch_state == 'RUNNING':
+ print(' [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
+ print()
+ print('[yellow]To stop the existing server, run:[/yellow]')
+ print(' pkill -f "archivebox server"')
+ print(' pkill -f supervisord')
+ sys.exit(1)
+
if run_in_debug:
- from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
- print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
- print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
- print(' > Writing ArchiveBox error log to ./logs/errors.log')
- if not reload:
- runserver_args.append('--noreload') # '--insecure'
- if nothreading:
- runserver_args.append('--nothreading')
- call_command("runserver", *runserver_args)
else:
- from archivebox.workers.supervisord_util import (
- get_existing_supervisord_process,
- get_worker,
- start_server_workers,
- is_port_in_use,
- )
- from archivebox.machine.models import Machine, Process
-
- # Check if port is already in use
- if is_port_in_use(host, int(port)):
- print(f'[red][X] Error: Port {port} is already in use[/red]')
- print(f' Another process (possibly daphne) is already listening on {host}:{port}')
- print(' Stop the conflicting process or choose a different port')
- sys.exit(1)
-
- # Check if the background crawl runner is already running for this data directory
- if Process.objects.filter(
- machine=Machine.current(),
- status=Process.StatusChoices.RUNNING,
- process_type=Process.TypeChoices.ORCHESTRATOR,
- ).exists():
- print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
- print(' Stop the existing runner before starting a new server')
- print(' To stop: pkill -f "archivebox run --daemon"')
- sys.exit(1)
-
- # Check if supervisord is already running
- supervisor = get_existing_supervisord_process()
- if supervisor:
- daphne_proc = get_worker(supervisor, 'worker_daphne')
- daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
-
- # If daphne is already running, error out
- if daphne_state == 'RUNNING':
- runner_proc = get_worker(supervisor, 'worker_runner')
- runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
- print('[red][X] Error: ArchiveBox server is already running[/red]')
- print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
- if runner_state == 'RUNNING':
- print(' [green]√[/green] Background runner (worker_runner) is RUNNING')
- print()
- print('[yellow]To stop the existing server, run:[/yellow]')
- print(' pkill -f "archivebox server"')
- print(' pkill -f supervisord')
- sys.exit(1)
- # Otherwise, daphne is not running - fall through to start it
-
- # No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
- print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
- print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
- print(' > Writing ArchiveBox error log to ./logs/errors.log')
- print()
- start_server_workers(host=host, port=port, daemonize=daemonize)
- print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
+ print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+ print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+ print(' > Writing ArchiveBox error log to ./logs/errors.log')
+ print()
+ start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
+ print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
@click.command()
diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py
index 46ad2949..ae65fdab 100644
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -172,6 +172,9 @@ def list_snapshots(
tag: Optional[str] = None,
crawl_id: Optional[str] = None,
limit: Optional[int] = None,
+ sort: Optional[str] = None,
+ csv: Optional[str] = None,
+ with_headers: bool = False,
) -> int:
"""
List Snapshots as JSONL with optional filters.
@@ -182,7 +185,11 @@ def list_snapshots(
from archivebox.misc.jsonl import write_record
from archivebox.core.models import Snapshot
- is_tty = sys.stdout.isatty()
+ if with_headers and not csv:
+ rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
+ return 2
+
+ is_tty = sys.stdout.isatty() and not csv
queryset = Snapshot.objects.all().order_by('-created_at')
@@ -199,7 +206,29 @@ def list_snapshots(
if tag:
queryset = queryset.filter(tags__name__iexact=tag)
+ if sort:
+ queryset = queryset.order_by(sort)
+
count = 0
+ if csv:
+ cols = [col.strip() for col in csv.split(',') if col.strip()]
+ if not cols:
+ rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
+ return 2
+ rows: list[str] = []
+ if with_headers:
+ rows.append(','.join(cols))
+ for snapshot in queryset.iterator(chunk_size=500):
+ rows.append(snapshot.to_csv(cols=cols, separator=','))
+ count += 1
+ output = '\n'.join(rows)
+ if output:
+ sys.stdout.write(output)
+ if not output.endswith('\n'):
+ sys.stdout.write('\n')
+ rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
+ return 0
+
for snapshot in queryset:
if is_tty:
status_color = {
diff --git a/archivebox/config/common.py b/archivebox/config/common.py
index 1546332d..f0395f97 100644
--- a/archivebox/config/common.py
+++ b/archivebox/config/common.py
@@ -1,6 +1,7 @@
__package__ = "archivebox.config"
import re
+import secrets
import sys
import shutil
from typing import ClassVar, Dict, Optional, List
@@ -8,7 +9,6 @@ from pathlib import Path
from rich import print
from pydantic import Field, field_validator
-from django.utils.crypto import get_random_string
from archivebox.config.configset import BaseConfigSet
@@ -104,7 +104,7 @@ class ServerConfig(BaseConfigSet):
"danger-onedomain-fullreplay",
)
- SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
+ SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
ADMIN_BASE_URL: str = Field(default="")
diff --git a/archivebox/config/views.py b/archivebox/config/views.py
index 8fa3adc8..df7a83d6 100644
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -1,10 +1,13 @@
__package__ = 'archivebox.config'
+import html
+import json
import os
-import shutil
import inspect
+import re
from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Callable, Dict
+from urllib.parse import quote, urlencode
from django.http import HttpRequest
from django.utils import timezone
from django.utils.html import format_html
@@ -18,16 +21,27 @@ from archivebox.misc.util import parse_date
from archivebox.machine.models import Binary
+ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
+ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
+LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
+ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
+INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
+
# Common binaries to check for
KNOWN_BINARIES = [
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
- 'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
+ 'node', 'npm', 'npx', 'yt-dlp',
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
'python3', 'python', 'bash', 'zsh',
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
]
+CANONICAL_BINARY_ALIASES = {
+ 'youtube-dl': 'yt-dlp',
+ 'ytdlp': 'yt-dlp',
+}
+
def is_superuser(request: HttpRequest) -> bool:
return bool(getattr(request.user, 'is_superuser', False))
@@ -38,6 +52,249 @@ def format_parsed_datetime(value: object) -> str:
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
+JSON_TOKEN_RE = re.compile(
+    r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
+    r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
+    r'|(?P<boolean>\btrue\b|\bfalse\b)'
+    r'|(?P<null>\bnull\b)'
+    r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
+)
+
+
+def render_code_block(text: str, *, highlighted: bool = False) -> str:
+ code = html.escape(text, quote=False)
+
+ if highlighted:
+ def _wrap_token(match: re.Match[str]) -> str:
+ styles = {
+ 'key': 'color: #0550ae;',
+ 'string': 'color: #0a7f45;',
+ 'boolean': 'color: #8250df; font-weight: 600;',
+ 'null': 'color: #6e7781; font-style: italic;',
+ 'number': 'color: #b35900;',
+ }
+ token_type = next(name for name, value in match.groupdict().items() if value is not None)
+            return f'<span style="{styles[token_type]}">{match.group(0)}</span>'
+
+ code = JSON_TOKEN_RE.sub(_wrap_token, code)
+
+    return (
+        '<pre>'
+        '<code>'
+        f'{code}'
+        '</code></pre>'
+    )
+
+
+def render_highlighted_json_block(value: Any) -> str:
+ return render_code_block(json.dumps(value, indent=2, ensure_ascii=False), highlighted=True)
+
+
+def get_plugin_docs_url(plugin_name: str) -> str:
+ return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}'
+
+
+def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
+ return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}'
+
+
+def get_live_config_url(key: str) -> str:
+ return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/'
+
+
+def get_environment_binary_url(name: str) -> str:
+ return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/'
+
+
+def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
+ binary_id = getattr(binary, 'id', None)
+ if not binary_id:
+ return None
+
+ base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/'
+ changelist_filters = urlencode({'q': canonical_binary_name(name)})
+ return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}'
+
+
+def get_machine_admin_url() -> str | None:
+ try:
+ from archivebox.machine.models import Machine
+ return Machine.current().admin_change_url
+ except Exception:
+ return None
+
+
+def render_code_tag_list(values: list[str]) -> str:
+ if not values:
+ return '(none) '
+
+ tags = ''.join(
+ str(format_html(
+ '{}',
+ value,
+ ))
+ for value in values
+ )
+ return f'{tags}
'
+
+
+def render_plugin_metadata_html(config: dict[str, Any]) -> str:
+ rows = (
+ ('Title', config.get('title') or '(none)'),
+ ('Description', config.get('description') or '(none)'),
+ ('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))),
+ ('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))),
+ ('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))),
+ )
+
+ rendered_rows = ''.join(
+ str(format_html(
+ '',
+ label,
+ value,
+ ))
+ for label, value in rows
+ )
+ return f'{rendered_rows}
'
+
+
+def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str:
+ if not values:
+ return '(none) '
+
+ tags = []
+ for value in values:
+ if url_resolver is None:
+ tags.append(str(format_html(
+ '{}',
+ value,
+ )))
+ else:
+ tags.append(str(format_html(
+ ''
+ '{}'
+ ' ',
+ url_resolver(value),
+ value,
+ )))
+ return f'{"".join(tags)}
'
+
+
+def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str:
+ links = [
+ str(format_html('Computed value ', get_live_config_url(prop_name))),
+ ]
+ if machine_admin_url:
+ links.append(str(format_html('Edit override ', machine_admin_url)))
+
+ fallback = prop_info.get('x-fallback')
+ if isinstance(fallback, str) and fallback:
+ links.append(str(format_html('Fallback: {} ', get_live_config_url(fallback), fallback)))
+
+ aliases = prop_info.get('x-aliases') or []
+ if isinstance(aliases, list):
+ for alias in aliases:
+ if isinstance(alias, str) and alias:
+ links.append(str(format_html('Alias: {} ', get_live_config_url(alias), alias)))
+
+ default = prop_info.get('default')
+ if prop_name.endswith('_BINARY') and isinstance(default, str) and default:
+ links.append(str(format_html('Binary: {} ', get_environment_binary_url(default), default)))
+
+ return ' '.join(links)
+
+
+def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
+ header_links = [
+ str(format_html('Dependencies ', ENVIRONMENT_BINARIES_BASE_URL)),
+ str(format_html('Installed Binaries ', INSTALLED_BINARIES_BASE_URL)),
+ ]
+ if machine_admin_url:
+ header_links.insert(0, str(format_html('Machine Config Editor ', machine_admin_url)))
+
+ cards = [
+ f'{" | ".join(header_links)}
'
+ ]
+
+ for prop_name, prop_info in properties.items():
+ prop_type = prop_info.get('type', 'unknown')
+ if isinstance(prop_type, list):
+ prop_type = ' | '.join(str(type_name) for type_name in prop_type)
+ prop_desc = prop_info.get('description', '')
+
+ default_html = ''
+ if 'default' in prop_info:
+ default_html = str(format_html(
+ 'Default: {}
',
+ prop_info['default'],
+ ))
+
+ description_html = prop_desc or mark_safe('(no description) ')
+ cards.append(str(format_html(
+ ''
+ '
'
+ '
{}
'
+ '
{}
'
+ '{}'
+ '
',
+ get_live_config_url(prop_name),
+ prop_name,
+ prop_type,
+ description_html,
+ mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)),
+ mark_safe(default_html),
+ )))
+
+ return ''.join(cards)
+
+
+def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
+ if not hooks:
+ return '(none) '
+
+ items = []
+ for hook_name in hooks:
+ if source == 'builtin':
+ items.append(str(format_html(
+ '',
+ get_plugin_hook_source_url(plugin_name, hook_name),
+ hook_name,
+ )))
+ else:
+ items.append(str(format_html(
+ '{}
',
+ hook_name,
+ )))
+ return ''.join(items)
+
+
+def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
+ installed_binary_url = get_installed_binary_change_url(name, db_binary)
+
+ if installed_binary_url:
+ return str(format_html(
+ '{} '
+ 'View Installed Binary Record ',
+ merged['abspath'],
+ installed_binary_url,
+ ))
+
+ return str(format_html('{}', merged['abspath']))
+
+
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
@@ -80,21 +337,41 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
return f" {str(obj)}"
-def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
- """Detect available binaries using shutil.which."""
- binaries = {}
+def canonical_binary_name(name: str) -> str:
+ return CANONICAL_BINARY_ALIASES.get(name, name)
- for name in KNOWN_BINARIES:
- path = shutil.which(name)
- if path:
- binaries[name] = {
- 'name': name,
- 'abspath': path,
- 'version': None, # Could add version detection later
- 'is_available': True,
- }
- return binaries
+def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
+ return (
+ int(binary.status == Binary.StatusChoices.INSTALLED),
+ int(bool(binary.version)),
+ int(bool(binary.abspath)),
+ binary.modified_at,
+ )
+
+
+def get_db_binaries_by_name() -> Dict[str, Binary]:
+ grouped: Dict[str, list[Binary]] = {}
+ for binary in Binary.objects.all():
+ grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
+
+ return {
+ name: max(records, key=_binary_sort_key)
+ for name, records in grouped.items()
+ }
+
+
+def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
+ is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
+ return {
+ 'name': canonical_binary_name(name),
+ 'version': str(getattr(binary, 'version', '') or ''),
+ 'binprovider': str(getattr(binary, 'binprovider', '') or ''),
+ 'abspath': str(getattr(binary, 'abspath', '') or ''),
+ 'sha256': str(getattr(binary, 'sha256', '') or ''),
+ 'status': str(getattr(binary, 'status', '') or ''),
+ 'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''),
+ }
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
@@ -150,29 +427,18 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Found Abspath": [],
}
- # Get binaries from database (previously detected/installed)
- db_binaries = {b.name: b for b in Binary.objects.all()}
-
- # Get currently detectable binaries
- detected = get_detected_binaries()
-
- # Merge and display
- all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
+ db_binaries = get_db_binaries_by_name()
+ all_binary_names = sorted(db_binaries.keys())
for name in all_binary_names:
- db_binary = db_binaries.get(name)
- detected_binary = detected.get(name)
+ merged = serialize_binary_record(name, db_binaries.get(name))
rows['Binary Name'].append(ItemLink(name, key=name))
- if db_binary:
- rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found')
- rows['Provided By'].append(db_binary.binprovider or 'PATH')
- rows['Found Abspath'].append(str(db_binary.abspath or ''))
- elif detected_binary:
- rows['Found Version'].append('✅ found')
- rows['Provided By'].append('PATH')
- rows['Found Abspath'].append(detected_binary['abspath'])
+ if merged['is_available']:
+ rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found')
+ rows['Provided By'].append(merged['binprovider'] or '-')
+ rows['Found Abspath'].append(merged['abspath'] or '-')
else:
rows['Found Version'].append('❌ missing')
rows['Provided By'].append('-')
@@ -187,41 +453,22 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
+ key = canonical_binary_name(key)
- # Try database first
- try:
- binary = Binary.objects.get(name=key)
- section: SectionData = {
- "name": binary.name,
- "description": str(binary.abspath or ''),
- "fields": {
- 'name': binary.name,
- 'binprovider': binary.binprovider,
- 'abspath': str(binary.abspath),
- 'version': binary.version,
- 'sha256': binary.sha256,
- },
- "help_texts": {},
- }
- return ItemContext(
- slug=key,
- title=key,
- data=[section],
- )
- except Binary.DoesNotExist:
- pass
+ db_binary = get_db_binaries_by_name().get(key)
+ merged = serialize_binary_record(key, db_binary)
- # Try to detect from PATH
- path = shutil.which(key)
- if path:
+ if merged['is_available']:
section: SectionData = {
"name": key,
- "description": path,
+ "description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
"fields": {
'name': key,
- 'binprovider': 'PATH',
- 'abspath': path,
- 'version': 'unknown',
+ 'binprovider': merged['binprovider'] or '-',
+ 'abspath': merged['abspath'] or 'not found',
+ 'version': merged['version'] or 'unknown',
+ 'sha256': merged['sha256'],
+ 'status': merged['status'],
},
"help_texts": {},
}
@@ -233,12 +480,13 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
section: SectionData = {
"name": key,
- "description": "Binary not found",
+ "description": "No persisted Binary record found",
"fields": {
'name': key,
- 'binprovider': 'not installed',
- 'abspath': 'not found',
- 'version': 'N/A',
+ 'binprovider': merged['binprovider'] or 'not recorded',
+ 'abspath': merged['abspath'] or 'not recorded',
+ 'version': merged['version'] or 'N/A',
+ 'status': merged['status'] or 'unrecorded',
},
"help_texts": {},
}
@@ -293,8 +541,6 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
- import json
-
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
plugins = get_filesystem_plugins()
@@ -308,45 +554,61 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
)
# Base fields that all plugins have
+ docs_url = get_plugin_docs_url(plugin['name'])
+ machine_admin_url = get_machine_admin_url()
fields = {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
- "path": plugin['path'],
- "hooks": ', '.join(plugin['hooks']),
}
- # Add config.json data if available
- if plugin.get('config'):
- config_json = json.dumps(plugin['config'], indent=2)
- fields["config.json"] = mark_safe(
- '{config_json} '
- )
-
- # Also extract and display individual config properties for easier viewing
- if 'properties' in plugin['config']:
- config_properties = plugin['config']['properties']
- properties_summary = []
- for prop_name, prop_info in config_properties.items():
- prop_type = prop_info.get('type', 'unknown')
- prop_desc = prop_info.get('description', '')
- properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}")
-
- if properties_summary:
- fields["Config Properties"] = mark_safe(' '.join(properties_summary))
-
- section: SectionData = {
+ sections: list[SectionData] = [{
"name": plugin['name'],
- "description": plugin['path'],
+ "description": format_html(
+ '{}ABX Plugin Docs ',
+ plugin['path'],
+ docs_url,
+ ),
"fields": fields,
"help_texts": {},
- }
+ }]
+
+ if plugin['hooks']:
+ sections.append({
+ "name": "Hooks",
+ "description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
+ "fields": {},
+ "help_texts": {},
+ })
+
+ if plugin.get('config'):
+ sections.append({
+ "name": "Plugin Metadata",
+ "description": mark_safe(render_plugin_metadata_html(plugin['config'])),
+ "fields": {},
+ "help_texts": {},
+ })
+
+ sections.append({
+ "name": "config.json",
+ "description": mark_safe(render_highlighted_json_block(plugin['config'])),
+ "fields": {},
+ "help_texts": {},
+ })
+
+ config_properties = plugin['config'].get('properties', {})
+ if config_properties:
+ sections.append({
+ "name": "Config Properties",
+ "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
+ "fields": {},
+ "help_texts": {},
+ })
return ItemContext(
slug=key,
title=plugin['name'],
- data=[section],
+ data=sections,
)
diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index 5a4c806c..6f5f3765 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -1,14 +1,23 @@
__package__ = 'archivebox.core'
+import html
+import json
import os
+import shlex
from pathlib import Path
+from urllib.parse import quote
+from functools import reduce
+from operator import and_
from django.contrib import admin
+from django.db.models import Min, Q, TextField
+from django.db.models.functions import Cast
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
+from django.utils.text import smart_split
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -16,11 +25,71 @@ from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
+from archivebox.core.widgets import InlineTagEditorWidget
+from archivebox.core.views import LIVE_PLUGIN_BASE_URL
from archivebox.core.models import ArchiveResult, Snapshot
+def _stringify_env_value(value) -> str:
+ if value is None:
+ return ''
+ if isinstance(value, str):
+ return value
+ return json.dumps(value, separators=(',', ':'))
+
+
+def _quote_shell_string(value: str) -> str:
+ return "'" + str(value).replace("'", "'\"'\"'") + "'"
+
+
+def _get_replay_source_url(result: ArchiveResult) -> str:
+ process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
+ return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
+
+
+def build_abx_dl_display_command(result: ArchiveResult) -> str:
+ source_url = _get_replay_source_url(result)
+ plugin_name = str(result.plugin or '').strip()
+ if not plugin_name and not source_url:
+ return 'abx-dl'
+ if not source_url:
+ return f'abx-dl --plugins={plugin_name}'
+ return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}'
+
+
+def build_abx_dl_replay_command(result: ArchiveResult) -> str:
+ display_command = build_abx_dl_display_command(result)
+ process = getattr(result, 'process', None)
+ env = getattr(process, 'env', None) or {}
+ env_items = ' '.join(
+ f'{key}={shlex.quote(_stringify_env_value(value))}'
+ for key, value in sorted(env.items())
+ if value is not None
+ )
+ snapshot_dir = shlex.quote(str(result.snapshot_dir))
+ if env_items:
+ return f'cd {snapshot_dir}; env {env_items} {display_command}'
+ return f'cd {snapshot_dir}; {display_command}'
+
+
+def get_plugin_admin_url(plugin_name: str) -> str:
+ from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs
+
+ plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
+ if plugin_dir:
+ builtin_root = BUILTIN_PLUGINS_DIR.resolve()
+ if plugin_dir.is_relative_to(builtin_root):
+ return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
+
+ user_root = USER_PLUGINS_DIR.resolve()
+ if plugin_dir.is_relative_to(user_root):
+ return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
+
+ return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
+
+
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
@@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
'failed': ('#991b1b', '#fee2e2'), # red
'queued': ('#6b7280', '#f3f4f6'), # gray
'started': ('#92400e', '#fef3c7'), # amber
+ 'backoff': ('#92400e', '#fef3c7'),
+ 'skipped': ('#475569', '#f1f5f9'),
+ 'noresults': ('#475569', '#f1f5f9'),
}
rows = []
@@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
if len(full_output) > 60:
output_display += '...'
- # Get full command as tooltip
- cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
+ display_cmd = build_abx_dl_display_command(result)
+ replay_cmd = build_abx_dl_replay_command(result)
+ cmd_str_escaped = html.escape(display_cmd)
+ cmd_attr = html.escape(replay_cmd, quote=True)
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
@@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
- {str(result.id)[:8]}
+ {str(result.id)[-8:]}
@@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
Command:
- {cmd_str}
+
+
+ Copy
+
+ {cmd_str_escaped}
+
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
- ID
+ Details
Status
Plugin
@@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
- fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
+ fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
- list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
+ list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
+ list_display_links = None
sort_fields = ('id', 'created_at', 'plugin', 'status')
- readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
- search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
+ readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
+ search_fields = ()
autocomplete_fields = ['snapshot']
fieldsets = (
@@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Plugin', {
- 'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'),
+ 'fields': ('plugin_with_icon', 'process_link', 'status'),
'classes': ('card',),
}),
('Timing', {
@@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
+ def get_queryset(self, request):
+ return (
+ super()
+ .get_queryset(request)
+ .select_related('snapshot', 'process')
+ .prefetch_related('snapshot__tags')
+ .annotate(snapshot_first_tag=Min('snapshot__tags__name'))
+ )
+
+ def get_search_results(self, request, queryset, search_term):
+ if not search_term:
+ return queryset, False
+
+ queryset = queryset.annotate(
+ snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
+ snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
+ output_json_text=Cast('output_json', output_field=TextField()),
+ cmd_text=Cast('process__cmd', output_field=TextField()),
+ )
+
+ search_bits = [
+ bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit
+ for bit in smart_split(search_term)
+ ]
+ search_bits = [bit.strip() for bit in search_bits if bit.strip()]
+ if not search_bits:
+ return queryset, False
+
+ filters = []
+ for bit in search_bits:
+ filters.append(
+ Q(snapshot_id_text__icontains=bit)
+ | Q(snapshot__url__icontains=bit)
+ | Q(snapshot__tags__name__icontains=bit)
+ | Q(snapshot_crawl_id_text__icontains=bit)
+ | Q(plugin__icontains=bit)
+ | Q(hook_name__icontains=bit)
+ | Q(output_str__icontains=bit)
+ | Q(output_json_text__icontains=bit)
+ | Q(cmd_text__icontains=bit)
+ )
+
+ return queryset.filter(reduce(and_, filters)).distinct(), True
+
+ @admin.display(description='Details', ordering='id')
+ def details_link(self, result):
+ return format_html(
+ '{} ',
+ reverse('admin:core_archiveresult_change', args=[result.id]),
+ str(result.id)[-8:],
+ )
+
@admin.display(
- description='Snapshot Info'
+ description='Snapshot',
+ ordering='snapshot__url',
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
@@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
return result.snapshot.tags_str()
+ @admin.display(description='Tags', ordering='snapshot_first_tag')
+ def tags_inline(self, result):
+ widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False)
+ tags_html = widget.render(
+ name=f'tags_{result.snapshot_id}',
+ value=result.snapshot.tags.all(),
+ attrs={'id': f'tags_{result.snapshot_id}'},
+ snapshot_id=str(result.snapshot_id),
+ )
+ return mark_safe(f'{tags_html} ')
+
+ @admin.display(description='Status', ordering='status')
+ def status_badge(self, result):
+ status = result.status or ArchiveResult.StatusChoices.QUEUED
+ return format_html(
+ '{} ',
+ status,
+ status,
+ result.get_status_display() or status,
+ )
+
@admin.display(description='Plugin', ordering='plugin')
def plugin_with_icon(self, result):
icon = get_plugin_icon(result.plugin)
return format_html(
- '{} {}',
+ '{} {} ',
+ get_plugin_admin_url(result.plugin),
result.plugin,
icon,
+ get_plugin_admin_url(result.plugin),
result.plugin,
)
- def cmd_str(self, result):
+ @admin.display(description='Process', ordering='process__pid')
+ def process_link(self, result):
+ if not result.process_id:
+ return '-'
+ process_label = result.process.pid if result.process and result.process.pid else '-'
return format_html(
- '{} ',
- ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
+ '{} ',
+ reverse('admin:machine_process_change', args=[result.process_id]),
+ process_label,
+ )
+
+ @admin.display(description='Machine', ordering='process__machine__hostname')
+ def machine_link(self, result):
+ if not result.process_id or not result.process or not result.process.machine_id:
+ return '-'
+ machine = result.process.machine
+ return format_html(
+ '{} {} ',
+ reverse('admin:machine_machine_change', args=[machine.id]),
+ str(machine.id)[:8],
+ machine.hostname,
+ )
+
+ @admin.display(description='Command')
+ def cmd_str(self, result):
+ display_cmd = build_abx_dl_display_command(result)
+ replay_cmd = build_abx_dl_replay_command(result)
+ return format_html(
+ '''
+
+
+ Copy
+
+
+ {}
+
+
+ ''',
+ replay_cmd,
+ replay_cmd,
+ display_cmd,
)
def output_display(self, result):
@@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.output_str,
)
+ @admin.display(description='Output', ordering='output_str')
+ def output_str_display(self, result):
+ output_text = str(result.output_str or '').strip()
+ if not output_text:
+ return '-'
+
+ live_path = result.embed_path() if hasattr(result, 'embed_path') else None
+ if live_path:
+ return format_html(
+ '{} ',
+ build_snapshot_url(str(result.snapshot_id), live_path),
+ output_text,
+ output_text,
+ )
+
+ return format_html(
+ '{} ',
+ output_text,
+ output_text,
+ )
+
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_html = format_html(
diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py
index 4541b8c3..ae6be452 100644
--- a/archivebox/core/admin_site.py
+++ b/archivebox/core/admin_site.py
@@ -61,12 +61,14 @@ def register_admin_site():
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
+ from archivebox.personas.admin import register_admin as register_personas_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
+ register_personas_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index cf70f85d..0202e62c 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -6,6 +6,7 @@ from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
+from django.shortcuts import get_object_or_404, redirect
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.utils import timezone
@@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
+from django.middleware.csrf import get_token
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
-from archivebox.workers.tasks import bg_archive_snapshots, bg_add
+from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import render_archiveresults_list
@@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def get_urls(self):
urls = super().get_urls()
custom_urls = [
- path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
+ path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'),
+ path('/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'),
]
return custom_urls + urls
+ def redo_failed_view(self, request, object_id):
+ snapshot = get_object_or_404(Snapshot, pk=object_id)
+
+ if request.method == 'POST':
+ queued = bg_archive_snapshot(snapshot, overwrite=False)
+ messages.success(
+ request,
+ f"Queued {queued} snapshot for re-archiving. The background runner will process it.",
+ )
+
+ return redirect(snapshot.admin_change_url)
+
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
@@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
+ redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/'
+ csrf_token = get_token(self.request)
return format_html(
'''
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Now
-
- 🔁 Redo Failed
-
+
- Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
+ Tip: Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.
''',
summary_url,
results_url,
obj.url,
obj.pk,
- obj.pk,
+ redo_failed_url,
+ csrf_token,
obj.pk,
obj.pk,
)
diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py
index 87396ad8..3658badc 100644
--- a/archivebox/core/admin_tags.py
+++ b/archivebox/core/admin_tags.py
@@ -1,63 +1,74 @@
__package__ = 'archivebox.core'
-from django.contrib import admin
+from urllib.parse import quote
+
+from django import forms
+from django.contrib import admin, messages
+from django.contrib.admin.options import IS_POPUP_VAR
+from django.http import HttpRequest, HttpResponseRedirect
+from django.urls import reverse
from django.utils.html import format_html
from django.utils.safestring import mark_safe
-from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
-
from archivebox.core.models import SnapshotTag, Tag
+from archivebox.core.tag_utils import (
+ TAG_HAS_SNAPSHOTS_CHOICES,
+ TAG_SORT_CHOICES,
+ build_tag_cards,
+ get_tag_creator_choices,
+ get_tag_year_choices,
+ normalize_created_by_filter,
+ normalize_created_year_filter,
+ normalize_has_snapshots_filter,
+ normalize_tag_sort,
+)
+from archivebox.core.host_utils import build_snapshot_url
class TagInline(admin.TabularInline):
model = SnapshotTag
- # fk_name = 'snapshot'
fields = ('id', 'tag')
extra = 1
- # min_num = 1
max_num = 1000
autocomplete_fields = (
'tag',
)
-
-# class AutocompleteTags:
-# model = Tag
-# search_fields = ['name']
-# name = 'name'
-# # source_field = 'name'
-# remote_field = Tag._meta.get_field('name')
-# class AutocompleteTagsAdminStub:
-# name = 'admin'
-
-
-# class TaggedItemInline(admin.TabularInline):
-# readonly_fields = ('object_link',)
-# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
-# model = TaggedItem
-# extra = 1
-# show_change_link = True
-
-# @admin.display(description='object')
-# def object_link(self, obj):
-# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
-# return format_html('[{}] ', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
+class TagAdminForm(forms.ModelForm):
+ class Meta:
+ model = Tag
+ fields = '__all__'
+ widgets = {
+ 'name': forms.TextInput(attrs={
+ 'placeholder': 'research, receipts, product-design...',
+ 'autocomplete': 'off',
+ 'spellcheck': 'false',
+ 'data-tag-name-input': '1',
+ }),
+ }
+
+ def clean_name(self):
+ name = (self.cleaned_data.get('name') or '').strip()
+ if not name:
+ raise forms.ValidationError('Tag name is required.')
+ return name
+
-
class TagAdmin(BaseModelAdmin):
- list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots')
+ form = TagAdminForm
+ change_list_template = 'admin/core/tag/change_list.html'
+ change_form_template = 'admin/core/tag/change_form.html'
+ list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
list_filter = ('created_at', 'created_by')
- sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
- readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
- actions = ['delete_selected', 'merge_tags']
- ordering = ['-created_at']
- # inlines = [TaggedItemInline]
+ readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
+ actions = ['delete_selected']
+ ordering = ['name', 'id']
fieldsets = (
- ('Tag Info', {
+ ('Tag', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
@@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin):
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
- ('Snapshots', {
+ ('Recent Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
- paginator = AccelleratedPaginator
+ add_fieldsets = (
+ ('Tag', {
+ 'fields': ('name',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Metadata', {
+ 'fields': ('created_by',),
+ 'classes': ('card',),
+ }),
+ )
+ def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
+ return self.fieldsets if obj else self.add_fieldsets
- def num_snapshots(self, tag):
+ def changelist_view(self, request: HttpRequest, extra_context=None):
+ query = (request.GET.get('q') or '').strip()
+ sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
+ created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
+ year = normalize_created_year_filter((request.GET.get('year') or '').strip())
+ has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
+ extra_context = {
+ **(extra_context or {}),
+ 'initial_query': query,
+ 'initial_sort': sort,
+ 'initial_created_by': created_by,
+ 'initial_year': year,
+ 'initial_has_snapshots': has_snapshots,
+ 'tag_sort_choices': TAG_SORT_CHOICES,
+ 'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
+ 'tag_created_by_choices': get_tag_creator_choices(),
+ 'tag_year_choices': get_tag_year_choices(),
+ 'initial_tag_cards': build_tag_cards(
+ query=query,
+ request=request,
+ sort=sort,
+ created_by=created_by,
+ year=year,
+ has_snapshots=has_snapshots,
+ ),
+ 'tag_search_api_url': reverse('api-1:search_tags'),
+ 'tag_create_api_url': reverse('api-1:tags_create'),
+ }
+ return super().changelist_view(request, extra_context=extra_context)
+
+ def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
+ current_name = (request.POST.get('name') or '').strip()
+ if not current_name and obj:
+ current_name = obj.name
+
+ similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
+ if obj:
+ similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
+
+ context.update({
+ 'tag_search_api_url': reverse('api-1:search_tags'),
+ 'tag_similar_cards': similar_tag_cards,
+ 'tag_similar_query': current_name,
+ })
+ return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
+
+ def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
+ if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST:
+ return super().response_add(request, obj, post_url_continue=post_url_continue)
+
+ self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
+ return self._redirect_to_changelist(obj.name)
+
+ def response_change(self, request: HttpRequest, obj: Tag):
+ if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST:
+ return super().response_change(request, obj)
+
+ self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
+ return self._redirect_to_changelist(obj.name)
+
+ def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
+ changelist_url = reverse('admin:core_tag_changelist')
+ if query:
+ changelist_url = f'{changelist_url}?q={quote(query)}'
+ return HttpResponseRedirect(changelist_url)
+
+ @admin.display(description='Snapshots')
+ def snapshots(self, tag: Tag):
+ snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
+ total_count = tag.snapshot_set.count()
+ if not snapshots:
+ return mark_safe(
+ f'No snapshots use this tag yet. '
+ f'Open filtered snapshot list .
'
+ )
+
+ cards = []
+ for snapshot in snapshots:
+ title = (snapshot.title or '').strip() or snapshot.url
+ cards.append(format_html(
+ '''
+
+
+
+ {}
+ {}
+
+
+ ''',
+ reverse('admin:core_snapshot_change', args=[snapshot.pk]),
+ build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
+ title[:120],
+ snapshot.url[:120],
+ ))
+
+ cards.append(format_html(
+ 'View all {} tagged snapshots ',
+ tag.id,
+ total_count,
+ ))
+ return mark_safe('' + ''.join(cards) + '
')
+
+ @admin.display(description='Snapshots', ordering='num_snapshots')
+ def num_snapshots(self, tag: Tag):
+ count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
return format_html(
'{} total ',
tag.id,
- tag.snapshot_set.count(),
+ count,
)
- def snapshots(self, tag):
- total_count = tag.snapshot_set.count()
- return mark_safe(' '.join(
- format_html(
- '[{}] {}',
- snap.pk,
- snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
- snap.url[:64],
- )
- for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
- ) + (f'{total_count} total snapshots... '))
-
- # def get_urls(self):
- # urls = super().get_urls()
- # custom_urls = [
- # path(
- # "merge-tags/",
- # self.admin_site.admin_view(self.merge_tags_view),
- # name="taggit_tag_merge_tags",
- # ),
- # ]
- # return custom_urls + urls
-
- # @admin.action(description="Merge selected tags")
- # def merge_tags(self, request, queryset):
- # selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
- # if not selected:
- # self.message_user(request, "Please select at least one tag.")
- # return redirect(request.get_full_path())
-
- # selected_tag_ids = ",".join(selected)
- # redirect_url = f"{request.get_full_path()}merge-tags/"
-
- # request.session["selected_tag_ids"] = selected_tag_ids
-
- # return redirect(redirect_url)
-
- # def merge_tags_view(self, request):
- # selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
- # if request.method == "POST":
- # form = MergeTagsForm(request.POST)
- # if form.is_valid():
- # new_tag_name = form.cleaned_data["new_tag_name"]
- # new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
- # with transaction.atomic():
- # for tag_id in selected_tag_ids:
- # tag = Tag.objects.get(id=tag_id)
- # tagged_items = TaggedItem.objects.filter(tag=tag)
- # for tagged_item in tagged_items:
- # if TaggedItem.objects.filter(
- # tag=new_tag,
- # content_type=tagged_item.content_type,
- # object_id=tagged_item.object_id,
- # ).exists():
- # # we have the new tag as well, so we can just
- # # remove the tag association
- # tagged_item.delete()
- # else:
- # # point this taggedItem to the new one
- # tagged_item.tag = new_tag
- # tagged_item.save()
-
- # # delete the old tag
- # if tag.id != new_tag.id:
- # tag.delete()
-
- # self.message_user(request, "Tags have been merged", level="success")
- # # clear the selected_tag_ids from session after merge is complete
- # request.session.pop("selected_tag_ids", None)
-
- # return redirect("..")
- # else:
- # self.message_user(request, "Form is invalid.", level="error")
-
- # context = {
- # "form": MergeTagsForm(),
- # "selected_tag_ids": selected_tag_ids,
- # }
- # return render(request, "admin/taggit/merge_tags_form.html", context)
-
-
-# @admin.register(SnapshotTag, site=archivebox_admin)
-# class SnapshotTagAdmin(BaseModelAdmin):
-# list_display = ('id', 'snapshot', 'tag')
-# sort_fields = ('id', 'snapshot', 'tag')
-# search_fields = ('id', 'snapshot_id', 'tag_id')
-# fields = ('snapshot', 'id')
-# actions = ['delete_selected']
-# ordering = ['-id']
-
def register_admin(admin_site):
admin_site.register(Tag, TagAdmin)
diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py
index 8589563a..6050a6a7 100644
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@@ -1,12 +1,16 @@
__package__ = 'archivebox.core'
from django import forms
+from django.utils.html import format_html
-from archivebox.misc.util import URL_REGEX
+from archivebox.misc.util import URL_REGEX, find_all_urls
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
-from archivebox.hooks import get_plugins
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
+from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget
+from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon
+from archivebox.personas.models import Persona
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -22,6 +26,22 @@ def get_plugin_choices():
return [(name, name) for name in get_plugins()]
+def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
+ schema = plugin_configs.get(plugin_name, {})
+ description = str(schema.get('description') or '').strip()
+ if not description:
+ return plugin_name
+ icon_html = get_plugin_icon(plugin_name)
+
+ return format_html(
+ '{} {} {} ',
+ icon_html,
+ plugin_name,
+ plugin_name,
+ description,
+ )
+
+
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
field = form.fields[name]
if not isinstance(field, forms.ChoiceField):
@@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
class AddLinkForm(forms.Form):
# Basic fields
- url = forms.RegexField(
- label="URLs (one per line)",
- regex=URL_REGEX,
- min_length=6,
+ url = forms.CharField(
+ label="URLs",
strip=True,
- widget=forms.Textarea,
+ widget=forms.Textarea(attrs={
+ 'data-url-regex': URL_REGEX.pattern,
+ }),
required=True
)
tag = forms.CharField(
- label="Tags (comma separated tag1,tag2,tag3)",
+ label="Tags",
strip=True,
required=False,
- widget=forms.TextInput(attrs={
- 'list': 'tag-datalist',
- 'autocomplete': 'off',
- })
+ widget=TagEditorWidget(),
)
depth = forms.ChoiceField(
label="Archive depth",
@@ -58,11 +75,15 @@ class AddLinkForm(forms.Form):
label="Notes",
strip=True,
required=False,
- widget=forms.Textarea(attrs={
- 'rows': 3,
- 'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
+ widget=forms.TextInput(attrs={
+ 'placeholder': 'Optional notes about this crawl',
})
)
+ url_filters = forms.Field(
+ label="URL allowlist / denylist",
+ required=False,
+ widget=URLFiltersWidget(source_selector='textarea[name="url"]'),
+ )
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
@@ -111,24 +132,15 @@ class AddLinkForm(forms.Form):
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
- persona = forms.CharField(
+ persona = forms.ModelChoiceField(
label="Persona (authentication profile)",
- max_length=100,
- initial='Default',
- required=False,
- )
- overwrite = forms.BooleanField(
- label="Overwrite existing snapshots",
- initial=False,
- required=False,
- )
- update = forms.BooleanField(
- label="Update/retry previously failed URLs",
- initial=False,
required=False,
+ queryset=Persona.objects.none(),
+ empty_label=None,
+ to_field_name='name',
)
index_only = forms.BooleanField(
- label="Index only (don't archive yet)",
+ label="Index only dry run (add crawl but don't archive yet)",
initial=False,
required=False,
)
@@ -142,11 +154,13 @@ class AddLinkForm(forms.Form):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- # Import at runtime to avoid circular imports
- from archivebox.config.common import ARCHIVING_CONFIG
+ default_persona = Persona.get_or_create_default()
+ self.fields['persona'].queryset = Persona.objects.order_by('name')
+ self.fields['persona'].initial = default_persona.name
# Get all plugins
all_plugins = get_plugins()
+ plugin_configs = discover_plugin_configs()
# Define plugin groups
chrome_dependent = {
@@ -170,26 +184,28 @@ class AddLinkForm(forms.Form):
# Populate plugin field choices
get_choice_field(self, 'chrome_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in chrome_dependent
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
]
get_choice_field(self, 'archiving_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in archiving
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
]
get_choice_field(self, 'parsing_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in parsing
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
]
get_choice_field(self, 'search_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in search
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
]
get_choice_field(self, 'binary_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in binary
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
]
get_choice_field(self, 'extension_plugins').choices = [
- (p, p) for p in sorted(all_plugins) if p in extensions
+ (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
]
- # Set update default from config
- self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
+ required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
+ search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
+ if required_search_plugin in search_choices:
+ get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
def clean(self):
cleaned_data = super().clean() or {}
@@ -207,6 +223,23 @@ class AddLinkForm(forms.Form):
return cleaned_data
+ def clean_url(self):
+ value = self.cleaned_data.get('url') or ''
+ urls = '\n'.join(find_all_urls(value))
+ if not urls:
+ raise forms.ValidationError('Enter at least one valid URL.')
+ return urls
+
+ def clean_url_filters(self):
+ from archivebox.crawls.models import Crawl
+
+ value = self.cleaned_data.get('url_filters') or {}
+ return {
+ 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
+ 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
+ 'same_domain_only': bool(value.get('same_domain_only')),
+ }
+
def clean_schedule(self):
schedule = (self.cleaned_data.get('schedule') or '').strip()
if not schedule:
diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py
index 2cf8131b..c3581d4f 100644
--- a/archivebox/core/host_utils.py
+++ b/archivebox/core/host_utils.py
@@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str:
return _build_base_url_for_host(get_api_host(), request=request)
+def get_public_base_url(request=None) -> str:
+ return _build_base_url_for_host(get_public_host(), request=request)
+
+
# Backwards-compat aliases (archive == web)
def get_archive_base_url(request=None) -> str:
return get_web_base_url(request=request)
diff --git a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py
new file mode 100644
index 00000000..4a8f74d1
--- /dev/null
+++ b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py
@@ -0,0 +1,15 @@
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ("core", "0031_add_archiveresult_snapshot_status_index"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="archiveresult",
+ name="retry_at",
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index cf008afa..7f33bf0a 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
-from archivebox.machine.models import NetworkInterface, Binary
+from archivebox.machine.models import Binary
@@ -60,32 +60,41 @@ class Tag(ModelWithUUID):
def __str__(self):
return self.name
+ def _generate_unique_slug(self) -> str:
+ base_slug = slugify(self.name) or 'tag'
+ existing = Tag.objects.filter(slug__startswith=base_slug)
+ if self.pk:
+ existing = existing.exclude(pk=self.pk)
+ existing_slugs = set(existing.values_list("slug", flat=True))
+
+ slug = base_slug
+ i = 1
+ while slug in existing_slugs:
+ slug = f"{base_slug}_{i}"
+ i += 1
+ return slug
+
def save(self, *args, **kwargs):
- is_new = self._state.adding
- if is_new:
- self.slug = slugify(self.name)
- existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
- i = None
- while True:
- slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
- if slug not in existing:
- self.slug = slug
- break
- i = (i or 0) + 1
+ existing_name = None
+ if self.pk:
+ existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first()
+
+ if not self.slug or existing_name != self.name:
+ self.slug = self._generate_unique_slug()
super().save(*args, **kwargs)
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created Tag',
- indent_level=0,
- metadata={
- 'id': self.id,
- 'name': self.name,
- 'slug': self.slug,
- },
- )
+ # if is_new:
+ # from archivebox.misc.logging_util import log_worker_event
+ # log_worker_event(
+ # worker_type='DB',
+ # event='Created Tag',
+ # indent_level=0,
+ # metadata={
+ # 'id': self.id,
+ # 'name': self.name,
+ # 'slug': self.slug,
+ # },
+ # )
@property
def api_url(self) -> str:
@@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
def save(self, *args, **kwargs):
- is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
super().save(*args, **kwargs)
self.ensure_legacy_archive_symlink()
- if self.url not in self.crawl.urls:
+ existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
+ if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created Snapshot',
- indent_level=2,
- url=self.url,
- metadata={
- 'id': str(self.id),
- 'crawl_id': str(self.crawl_id),
- 'depth': self.depth,
- 'status': self.status,
- },
- )
+ # if is_new:
+ # from archivebox.misc.logging_util import log_worker_event
+ # log_worker_event(
+ # worker_type='DB',
+ # event='Created Snapshot',
+ # indent_level=2,
+ # url=self.url,
+ # metadata={
+ # 'id': str(self.id),
+ # 'crawl_id': str(self.crawl_id),
+ # 'depth': self.depth,
+ # 'status': self.status,
+ # },
+ # )
# =========================================================================
# Filesystem Migration Methods
@@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
- Called by: SnapshotMachine.enter_started()
-
- Hook Lifecycle:
- 1. discover_hooks('Snapshot') → finds all plugin hooks
- 2. For each hook:
- - Create ArchiveResult with status=QUEUED
- - Store hook_name (e.g., 'on_Snapshot__50_wget.py')
- 3. ArchiveResults execute independently via ArchiveResultMachine
- 4. Hook execution happens in ArchiveResult.run(), NOT here
-
Returns:
list[ArchiveResult]: Newly created pending results
"""
@@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'url': self.url,
'title': self.title,
'tags': self.tags_str(),
- 'tags_str': self.tags_str(),
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None,
'timestamp': self.timestamp,
@@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# ID not found, fall through to create-by-URL logic
pass
- url = record.get('url')
+ from archivebox.misc.util import fix_url_from_markdown
+
+ url = fix_url_from_markdown(str(record.get('url') or '').strip())
if not url:
return None
@@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
defaults={
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
- 'retry_at': timezone.now(),
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
failed = results.filter(status='failed').count()
running = results.filter(status='started').count()
skipped = results.filter(status='skipped').count()
+ noresults = results.filter(status='noresults').count()
total = results.count()
- pending = total - succeeded - failed - running - skipped
+ pending = total - succeeded - failed - running - skipped - noresults
- # Calculate percentage (succeeded + failed + skipped as completed)
- completed = succeeded + failed + skipped
+ # Calculate percentage (succeeded + failed + skipped + noresults as completed)
+ completed = succeeded + failed + skipped + noresults
percent = int((completed / total * 100) if total > 0 else 0)
# Sum output sizes
@@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'running': running,
'pending': pending,
'skipped': skipped,
+ 'noresults': noresults,
'percent': percent,
'output_size': output_size,
'is_sealed': is_sealed,
}
- def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
+ def retry_failed_archiveresults(self) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
- This enables seamless retry of the entire extraction pipeline:
- - Resets FAILED and SKIPPED results to QUEUED
- - Sets retry_at so workers pick them up
- - Plugins run in order (numeric prefix)
- - Each plugin checks its dependencies at runtime
-
- Dependency handling (e.g., chrome → screenshot):
- - Plugins check if required outputs exist before running
- - If dependency output missing → plugin returns 'skipped'
- - On retry, if dependency now succeeds → dependent can run
-
Returns count of ArchiveResults reset.
"""
- retry_at = retry_at or timezone.now()
-
count = self.archiveresult_set.filter(
status__in=[
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
+ ArchiveResult.StatusChoices.NORESULTS,
]
).update(
status=ArchiveResult.StatusChoices.QUEUED,
- retry_at=retry_at,
- output=None,
+ output_str='',
+ output_json=None,
+ output_files={},
+ output_size=0,
+ output_mimetypes='',
start_ts=None,
end_ts=None,
)
- # Also reset the snapshot and current_step so it gets re-checked from the beginning
if count > 0:
self.status = self.StatusChoices.STARTED
- self.retry_at = retry_at
+ self.retry_at = timezone.now()
self.current_step = 0 # Reset to step 0 for retry
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
@@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
+ 'snapshot': self,
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
'url_str': htmlencode(urldecode(self.base_url)),
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
@@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine):
│ • discover_hooks('Snapshot') → finds all plugin hooks │
│ • create_pending_archiveresults() → creates ONE │
│ ArchiveResult per hook (NO execution yet) │
- │ 2. ArchiveResults process independently with their own │
- │ state machines (see ArchiveResultMachine) │
+ │ 2. The shared abx-dl runner executes hooks and the │
+ │ projector updates ArchiveResult rows from events │
│ 3. Advance through steps 0-9 as foreground hooks complete │
└─────────────────────────────────────────────────────────────┘
↓ tick() when is_finished()
@@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine):
cast(Any, crawl).sm.seal()
-class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
+class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
+ NORESULTS = 'noresults', 'No Results'
+
+ INITIAL_STATE = StatusChoices.QUEUED
+ ACTIVE_STATE = StatusChoices.STARTED
+ FINAL_STATES = (
+ StatusChoices.SUCCEEDED,
+ StatusChoices.FAILED,
+ StatusChoices.SKIPPED,
+ StatusChoices.NORESULTS,
+ )
+ FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
@classmethod
def get_plugin_choices(cls):
@@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
- status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
- retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
+ status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
notes = models.TextField(blank=True, null=False, default='')
# output_dir is computed via @property from snapshot.output_dir / plugin
- state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
- retry_at_field_name = 'retry_at'
- state_field_name = 'status'
- active_state = StatusChoices.STARTED
-
snapshot_id: uuid.UUID
process_id: uuid.UUID | None
@@ -2421,7 +2418,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
- ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = 'Archive Result'
@@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
def save(self, *args, **kwargs):
- is_new = self._state.adding
-
- # Create Process record if this is a new ArchiveResult and no process exists yet
- if is_new and not self.process_id:
- from archivebox.machine.models import Process, Machine
-
- process = Process.objects.create(
- machine=Machine.current(),
- pwd=str(Path(self.snapshot.output_dir) / self.plugin),
- cmd=[], # Will be set by run()
- status='queued',
- timeout=120,
- env={},
- )
- self.process = process
-
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
# Call the Django Model.save() directly instead
models.Model.save(self, *args, **kwargs)
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- log_worker_event(
- worker_type='DB',
- event='Created ArchiveResult',
- indent_level=3,
- plugin=self.plugin,
- metadata={
- 'id': str(self.id),
- 'snapshot_id': str(self.snapshot_id),
- 'snapshot_url': str(self.snapshot.url)[:64],
- 'status': self.status,
- },
- )
+ # if is_new:
+ # from archivebox.misc.logging_util import log_worker_event
+ # log_worker_event(
+ # worker_type='DB',
+ # event='Created ArchiveResult',
+ # indent_level=3,
+ # plugin=self.plugin,
+ # metadata={
+ # 'id': str(self.id),
+ # 'snapshot_id': str(self.snapshot_id),
+ # 'snapshot_url': str(self.snapshot.url)[:64],
+ # 'status': self.status,
+ # },
+ # )
@cached_property
def snapshot_dir(self):
@@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def get_absolute_url(self):
return f'/{self.snapshot.archive_path}/{self.plugin}'
+ def reset_for_retry(self, *, save: bool = True) -> None:
+ self.status = self.StatusChoices.QUEUED
+ self.output_str = ''
+ self.output_json = None
+ self.output_files = {}
+ self.output_size = 0
+ self.output_mimetypes = ''
+ self.start_ts = None
+ self.end_ts = None
+ if save:
+ self.save(update_fields=[
+ 'status',
+ 'output_str',
+ 'output_json',
+ 'output_files',
+ 'output_size',
+ 'output_mimetypes',
+ 'start_ts',
+ 'end_ts',
+ 'modified_at',
+ ])
+
@property
def plugin_module(self) -> Any | None:
# Hook scripts are now used instead of Python plugin modules
@@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
- def create_output_dir(self):
- output_dir = Path(self.snapshot_dir) / self.plugin
- output_dir.mkdir(parents=True, exist_ok=True)
- return output_dir
-
@property
def output_dir_name(self) -> str:
return self.plugin
@@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
pass
- def cascade_health_update(self, success: bool):
- """Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
- # Update archival hierarchy
- self.snapshot.increment_health_stats(success)
- self.snapshot.crawl.increment_health_stats(success)
-
- # Update execution infrastructure
- if self.binary:
- self.binary.increment_health_stats(success)
- if self.binary.machine:
- self.binary.machine.increment_health_stats(success)
-
- if self.iface:
- self.iface.increment_health_stats(success)
-
- def run(self):
- """
- Execute this ArchiveResult's hook and update status.
-
- If self.hook_name is set, runs only that specific hook.
- If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
-
- Updates status/output fields, queues discovered URLs, and triggers indexing.
- """
- from django.utils import timezone
- from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
- from archivebox.config.configset import get_config
-
- # Get merged config with proper context
- config = get_config(
- crawl=self.snapshot.crawl,
- snapshot=self.snapshot,
- )
-
- # Determine which hook(s) to run
- hooks = []
-
- if self.hook_name:
- # SPECIFIC HOOK MODE: Find the specific hook by name
- for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
- if not base_dir.exists():
- continue
- plugin_dir = base_dir / self.plugin
- if plugin_dir.exists():
- hook_path = plugin_dir / self.hook_name
- if hook_path.exists():
- hooks.append(hook_path)
- break
- else:
- # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
- for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
- if not base_dir.exists():
- continue
- plugin_dir = base_dir / self.plugin
- if plugin_dir.exists():
- matches = list(plugin_dir.glob('on_Snapshot__*.*'))
- if matches:
- hooks.extend(sorted(matches))
-
- if not hooks:
- self.status = self.StatusChoices.FAILED
- if self.hook_name:
- self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
- else:
- self.output_str = f'No hooks found for plugin: {self.plugin}'
- self.retry_at = None
- self.save()
- return
-
- # Output directory is plugin_dir for the hook output
- plugin_dir = Path(self.snapshot.output_dir) / self.plugin
-
- start_ts = timezone.now()
- process = None
-
- for hook in hooks:
- # Run hook using Process.launch() - returns Process model
- process = run_hook(
- hook,
- output_dir=plugin_dir,
- config=config,
- url=self.snapshot.url,
- snapshot_id=str(self.snapshot.id),
- crawl_id=str(self.snapshot.crawl.id),
- depth=self.snapshot.depth,
- )
-
- # Link ArchiveResult to Process
- self.process = process
- self.start_ts = start_ts
- self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
-
- if not process:
- # No hooks ran
- self.status = self.StatusChoices.FAILED
- self.output_str = 'No hooks executed'
- self.save()
- return
-
- # Update status based on hook execution
- if process.status == process.StatusChoices.RUNNING:
- # BACKGROUND HOOK - still running, return immediately
- # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
- return
-
- # FOREGROUND HOOK - completed, update from filesystem
- self.update_from_output()
-
- # Clean up empty output directory if no files were created
- if plugin_dir.exists() and not self.output_files:
- try:
- if not any(plugin_dir.iterdir()):
- plugin_dir.rmdir()
- except (OSError, RuntimeError):
- pass
-
def update_from_output(self):
"""
Update this ArchiveResult from filesystem logs and output files.
- Used for:
- - Foreground hooks that completed (called from ArchiveResult.run())
- - Background hooks that completed (called from Snapshot.cleanup())
+ Used for Snapshot cleanup / orphan recovery when a hook's output exists
+ on disk but the projector did not finalize the row in the database.
Updates:
- status, output_str, output_json from ArchiveResult JSONL record
- output_files, output_size, output_mimetypes by walking filesystem
- - end_ts, retry_at, cmd, cmd_version, binary FK
+ - end_ts, cmd, cmd_version, binary FK
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
"""
import mimetypes
@@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.status = self.StatusChoices.FAILED
self.output_str = 'Output directory not found'
self.end_ts = timezone.now()
- self.retry_at = None
self.save()
return
@@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
'succeeded': self.StatusChoices.SUCCEEDED,
'failed': self.StatusChoices.FAILED,
'skipped': self.StatusChoices.SKIPPED,
+ 'noresults': self.StatusChoices.NORESULTS,
}
self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
@@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Update timestamps
self.end_ts = timezone.now()
- self.retry_at = None
self.save()
@@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
"""
- import re
- from archivebox.config.configset import get_config
-
- # Get merged config with proper hierarchy
- config = get_config(
- user=self.created_by,
- crawl=self.snapshot.crawl,
- snapshot=self.snapshot,
- )
-
- # Get allowlist/denylist (can be string or list)
- allowlist_raw = config.get('URL_ALLOWLIST', '')
- denylist_raw = config.get('URL_DENYLIST', '')
-
- # Normalize to list of patterns
- def to_pattern_list(value):
- if isinstance(value, list):
- return value
- if isinstance(value, str):
- return [p.strip() for p in value.split(',') if p.strip()]
- return []
-
- allowlist = to_pattern_list(allowlist_raw)
- denylist = to_pattern_list(denylist_raw)
-
- # Denylist takes precedence
- if denylist:
- for pattern in denylist:
- try:
- if re.search(pattern, url):
- return False
- except re.error:
- continue # Skip invalid regex patterns
-
- # If allowlist exists, URL must match at least one pattern
- if allowlist:
- for pattern in allowlist:
- try:
- if re.search(pattern, url):
- return True
- except re.error:
- continue # Skip invalid regex patterns
- return False # No allowlist patterns matched
-
- return True # No filters or passed filters
+ return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
@property
def output_dir(self) -> Path:
"""Get the output directory for this plugin's results."""
return Path(self.snapshot.output_dir) / self.plugin
- def is_background_hook(self) -> bool:
- """Check if this ArchiveResult is for a background hook."""
- plugin_dir = Path(self.pwd) if self.pwd else None
- if not plugin_dir:
- return False
- pid_file = plugin_dir / 'hook.pid'
- return pid_file.exists()
-
-
-# =============================================================================
-# ArchiveResult State Machine
-# =============================================================================
-
-class ArchiveResultMachine(BaseStateMachine):
- """
- State machine for managing ArchiveResult (single plugin execution) lifecycle.
-
- Hook Lifecycle:
- ┌─────────────────────────────────────────────────────────────┐
- │ QUEUED State │
- │ • Waiting for its turn to run │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() when can_start()
- ┌─────────────────────────────────────────────────────────────┐
- │ STARTED State → enter_started() │
- │ 1. archiveresult.run() │
- │ • Find specific hook by hook_name │
- │ • run_hook(script, output_dir, ...) → subprocess │
- │ │
- │ 2a. FOREGROUND hook (returns HookResult): │
- │ • update_from_output() immediately │
- │ - Read stdout.log │
- │ - Parse JSONL records │
- │ - Extract 'ArchiveResult' record → update status │
- │ - Walk output_dir → populate output_files │
- │ - Call process_hook_records() for side effects │
- │ │
- │ 2b. BACKGROUND hook (returns None): │
- │ • Status stays STARTED │
- │ • Continues running in background │
- │ • Killed by Snapshot.cleanup() when sealed │
- └─────────────────────────────────────────────────────────────┘
- ↓ tick() checks status
- ┌─────────────────────────────────────────────────────────────┐
- │ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
- │ • Set by hook's JSONL output during update_from_output() │
- │ • Health stats incremented (num_uses_succeeded/failed) │
- │ • Parent Snapshot health stats also updated │
- └─────────────────────────────────────────────────────────────┘
-
- https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
- """
-
- model_attr_name = 'archiveresult'
-
- # States
- queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
- started = State(value=ArchiveResult.StatusChoices.STARTED)
- backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
- succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
- failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
- skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
-
- # Tick Event - transitions based on conditions
- # Flow: queued → started → (succeeded|failed|skipped)
- # queued → skipped (if exceeded max attempts)
- # started → backoff → started (retry)
- tick = (
- queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
- | queued.to.itself(unless='can_start')
- | queued.to(started, cond='can_start')
- | started.to(succeeded, cond='is_succeeded')
- | started.to(failed, cond='is_failed')
- | started.to(skipped, cond='is_skipped')
- | started.to(backoff, cond='is_backoff')
- | backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
- | backoff.to.itself(unless='can_start')
- | backoff.to(started, cond='can_start')
- # Removed redundant transitions: backoff.to(succeeded/failed/skipped)
- # Reason: backoff should always retry→started, then started→final states
- )
-
- archiveresult: ArchiveResult
-
- def can_start(self) -> bool:
- """Pure function - check if AR can start (has valid URL)."""
- return bool(self.archiveresult.snapshot.url)
-
- def is_exceeded_max_attempts(self) -> bool:
- """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
- from archivebox.config.configset import get_config
-
- config = get_config(
- crawl=self.archiveresult.snapshot.crawl,
- snapshot=self.archiveresult.snapshot,
- )
- max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
-
- # Count failed ArchiveResults for this snapshot (any plugin type)
- failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
- status=ArchiveResult.StatusChoices.FAILED
- ).count()
-
- return failed_count >= max_attempts
-
- def is_succeeded(self) -> bool:
- """Check if extractor plugin succeeded (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-
- def is_failed(self) -> bool:
- """Check if extractor plugin failed (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-
- def is_skipped(self) -> bool:
- """Check if extractor plugin was skipped (status was set by run())."""
- return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
-
- def is_backoff(self) -> bool:
- """Check if we should backoff and retry later."""
- # Backoff if status is still started (plugin didn't complete) and output_str is empty
- return (
- self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
- and not self.archiveresult.output_str
- )
-
- def is_finished(self) -> bool:
- """
- Check if extraction has completed (success, failure, or skipped).
-
- For background hooks in STARTED state, checks if their Process has finished and reaps them.
- """
- # If already in final state, return True
- if self.archiveresult.status in (
- ArchiveResult.StatusChoices.SUCCEEDED,
- ArchiveResult.StatusChoices.FAILED,
- ArchiveResult.StatusChoices.SKIPPED,
- ):
- return True
-
- # If in STARTED state with a Process, check if Process has finished running
- if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
- if self.archiveresult.process_id:
- process = self.archiveresult.process
-
- # If process is NOT running anymore, reap the background hook
- if not process.is_running:
- self.archiveresult.update_from_output()
- # Check if now in final state after reaping
- return self.archiveresult.status in (
- ArchiveResult.StatusChoices.SUCCEEDED,
- ArchiveResult.StatusChoices.FAILED,
- ArchiveResult.StatusChoices.SKIPPED,
- )
-
- return False
-
- @queued.enter
- def enter_queued(self):
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now(),
- status=ArchiveResult.StatusChoices.QUEUED,
- start_ts=None,
- ) # bump the snapshot's retry_at so they pickup any new changes
-
- @started.enter
- def enter_started(self):
-
- # Update Process with network interface
- if self.archiveresult.process_id:
- self.archiveresult.process.iface = NetworkInterface.current()
- self.archiveresult.process.save()
-
- # Lock the object and mark start time
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
- status=ArchiveResult.StatusChoices.STARTED,
- start_ts=timezone.now(),
- )
-
- # Run the plugin - this updates status, output, timestamps, etc.
- self.archiveresult.run()
-
- # Save the updated result
- self.archiveresult.save()
-
-
- @backoff.enter
- def enter_backoff(self):
- self.archiveresult.update_and_requeue(
- retry_at=timezone.now() + timedelta(seconds=60),
- status=ArchiveResult.StatusChoices.BACKOFF,
- end_ts=None,
- )
-
- def _check_and_seal_parent_snapshot(self):
- """
- Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
-
- Note: In the new architecture, the shared runner handles step advancement and sealing.
- This method is kept for direct model-driven edge cases.
- """
- import sys
-
- snapshot = self.archiveresult.snapshot
-
- # Check if all archiveresults are finished (in final states)
- remaining_active = snapshot.archiveresult_set.exclude(
- status__in=[
- ArchiveResult.StatusChoices.SUCCEEDED,
- ArchiveResult.StatusChoices.FAILED,
- ArchiveResult.StatusChoices.SKIPPED,
- ]
- ).count()
-
- if remaining_active == 0:
- print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
- # Seal the parent snapshot
- cast(Any, snapshot).sm.seal()
-
- @succeeded.enter
- def enter_succeeded(self):
- import sys
-
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.SUCCEEDED,
- end_ts=timezone.now(),
- )
-
- # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
- self.archiveresult.cascade_health_update(success=True)
-
- print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
-
- # Check if this is the last AR to finish - seal parent snapshot if so
- self._check_and_seal_parent_snapshot()
-
- @failed.enter
- def enter_failed(self):
- import sys
-
- print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
-
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.FAILED,
- end_ts=timezone.now(),
- )
-
- # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
- self.archiveresult.cascade_health_update(success=False)
-
- print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
-
- # Check if this is the last AR to finish - seal parent snapshot if so
- self._check_and_seal_parent_snapshot()
-
- @skipped.enter
- def enter_skipped(self):
- import sys
-
- # Set output_str if not already set (e.g., when skipped due to max attempts)
- if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
- from archivebox.config.configset import get_config
- config = get_config(
- crawl=self.archiveresult.snapshot.crawl,
- snapshot=self.archiveresult.snapshot,
- )
- max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
- self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
-
- self.archiveresult.update_and_requeue(
- retry_at=None,
- status=ArchiveResult.StatusChoices.SKIPPED,
- end_ts=timezone.now(),
- )
-
- print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
-
- # Check if this is the last AR to finish - seal parent snapshot if so
- self._check_and_seal_parent_snapshot()
-
-
# =============================================================================
# State Machine Registration
# =============================================================================
@@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine):
# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
registry.register(SnapshotMachine)
-registry.register(ArchiveResultMachine)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 7f855b94..3a296516 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = {
# https://gcollazo.com/optimal-sqlite-settings-for-django/
# https://litestream.io/tips/#busy-timeout
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
- "timeout": 10,
+ "timeout": 30,
"check_same_thread": False,
"transaction_mode": "IMMEDIATE",
"init_command": (
"PRAGMA foreign_keys=ON;"
+ "PRAGMA busy_timeout = 30000;"
"PRAGMA journal_mode = WAL;"
"PRAGMA synchronous = NORMAL;"
"PRAGMA temp_store = MEMORY;"
diff --git a/archivebox/core/tag_utils.py b/archivebox/core/tag_utils.py
new file mode 100644
index 00000000..de562b34
--- /dev/null
+++ b/archivebox/core/tag_utils.py
@@ -0,0 +1,271 @@
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from typing import Any
+
+from django.contrib.auth.models import User
+from django.db.models import Count, F, Q, QuerySet
+from django.db.models.functions import Lower
+from django.http import HttpRequest
+from django.urls import reverse
+
+from archivebox.core.host_utils import build_snapshot_url, build_web_url
+from archivebox.core.models import Snapshot, SnapshotTag, Tag
+
+
# Max number of snapshot previews embedded in each tag card.
TAG_SNAPSHOT_PREVIEW_LIMIT = 10
# (value, label) pairs for the tag-list sort dropdown; values are validated
# by normalize_tag_sort() before being used in get_matching_tags().
TAG_SORT_CHOICES = (
    ('name_asc', 'Name A-Z'),
    ('name_desc', 'Name Z-A'),
    ('created_desc', 'Created newest'),
    ('created_asc', 'Created oldest'),
    ('snapshots_desc', 'Most snapshots'),
    ('snapshots_asc', 'Fewest snapshots'),
)
# (value, label) pairs for the has-snapshots filter dropdown; values are
# validated by normalize_has_snapshots_filter().
TAG_HAS_SNAPSHOTS_CHOICES = (
    ('all', 'All'),
    ('yes', 'Has snapshots'),
    ('no', 'No snapshots'),
)
+
+
def normalize_tag_name(name: str) -> str:
    """Strip surrounding whitespace from a tag name; empty/None becomes ''."""
    if not name:
        return ''
    return name.strip()
+
+
def normalize_tag_sort(sort: str = 'created_desc') -> str:
    """Return *sort* if it is a known TAG_SORT_CHOICES key, else 'created_desc'."""
    for key, _label in TAG_SORT_CHOICES:
        if sort == key:
            return sort
    return 'created_desc'
+
+
def normalize_has_snapshots_filter(value: str = 'all') -> str:
    """Return *value* if it is a known TAG_HAS_SNAPSHOTS_CHOICES key, else 'all'."""
    for key, _label in TAG_HAS_SNAPSHOTS_CHOICES:
        if value == key:
            return value
    return 'all'
+
+
def normalize_created_by_filter(created_by: str = '') -> str:
    """Validate a created-by user-id filter value.

    Returns *created_by* unchanged when it consists solely of decimal digits
    (safe to pass to int() in get_matching_tags()), otherwise ''.
    """
    # Use .isdecimal() rather than .isdigit(): .isdigit() also accepts
    # characters like '²' that int() rejects, which would crash the
    # downstream int(created_by) call.
    return created_by if str(created_by).isdecimal() else ''
+
+
def normalize_created_year_filter(year: str = '') -> str:
    """Validate a 4-digit creation-year filter value.

    Returns the stripped year when it is exactly four decimal digits (safe
    to pass to int() in get_matching_tags()), otherwise ''.
    """
    year = (year or '').strip()
    # .isdecimal() instead of .isdigit(): .isdigit() accepts non-decimal
    # digit characters (e.g. superscripts) that int(year) would reject.
    return year if len(year) == 4 and year.isdecimal() else ''
+
+
def get_matching_tags(
    query: str = '',
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> QuerySet[Tag]:
    """Return Tags annotated with num_snapshots, filtered and ordered.

    All filter arguments are normalized first; unrecognized values fall back
    to their defaults instead of raising.
    """
    tags = Tag.objects.select_related('created_by').annotate(
        num_snapshots=Count('snapshot_set', distinct=True),
    )

    search = normalize_tag_name(query)
    if search:
        tags = tags.filter(Q(name__icontains=search) | Q(slug__icontains=search))

    creator = normalize_created_by_filter(created_by)
    if creator:
        tags = tags.filter(created_by_id=int(creator))

    created_year = normalize_created_year_filter(year)
    if created_year:
        tags = tags.filter(created_at__year=int(created_year))

    snapshot_filter = normalize_has_snapshots_filter(has_snapshots)
    if snapshot_filter == 'yes':
        tags = tags.filter(num_snapshots__gt=0)
    elif snapshot_filter == 'no':
        tags = tags.filter(num_snapshots=0)

    # Map each sort key to its order_by() arguments; unknown keys were
    # already normalized to 'created_desc' above.
    ordering = {
        'name_asc': (Lower('name'), 'id'),
        'name_desc': (Lower('name').desc(), '-id'),
        'created_asc': (F('created_at').asc(nulls_first=True), 'id', Lower('name')),
        'created_desc': (F('created_at').desc(nulls_last=True), '-id', Lower('name')),
        'snapshots_desc': (F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name')),
        'snapshots_asc': (F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id'),
    }[normalize_tag_sort(sort)]
    return tags.order_by(*ordering)
+
+
def get_tag_creator_choices() -> list[tuple[str, str]]:
    """Return (user_id, username) dropdown choices for users who created tags.

    IDs are stringified for direct use as HTML option values; a missing
    username falls back to 'User <id>'.
    """
    rows = (
        Tag.objects
        .filter(created_by__isnull=False)
        .values_list('created_by_id', 'created_by__username')
        # ordering by Lower(username) keeps the dropdown alphabetical
        # regardless of username casing
        .order_by(Lower('created_by__username'), 'created_by_id')
        .distinct()
    )
    return [(str(user_id), username or f'User {user_id}') for user_id, username in rows]
+
+
def get_tag_year_choices() -> list[str]:
    """Return the distinct years (newest first) in which tags were created."""
    created_years = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC')
    return [str(d.year) for d in created_years]
+
+
def get_tag_by_ref(tag_ref: str | int) -> Tag:
    """Resolve a tag reference that may be a primary key or a slug.

    Lookup order: int pk -> all-digit string pk -> exact slug
    (case-insensitive) -> slug substring match.

    Raises:
        Tag.DoesNotExist: when nothing matches.

    NOTE(review): the slug__icontains fallback uses .get(), so it can also
    raise Tag.MultipleObjectsReturned when several slugs contain the ref --
    confirm callers handle that.
    """
    if isinstance(tag_ref, int):
        return Tag.objects.get(pk=tag_ref)

    ref = str(tag_ref).strip()
    if ref.isdigit():
        return Tag.objects.get(pk=int(ref))

    try:
        return Tag.objects.get(slug__iexact=ref)
    except Tag.DoesNotExist:
        return Tag.objects.get(slug__icontains=ref)
+
+
def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]:
    """Fetch a tag by case-insensitive name, creating it if absent.

    Returns (tag, created) in the style of Django's get_or_create().

    Raises:
        ValueError: if the name is empty after stripping.
    """
    tag_name = normalize_tag_name(name)
    if not tag_name:
        raise ValueError('Tag name is required')

    match = Tag.objects.filter(name__iexact=tag_name).first()
    if match is not None:
        return match, False

    return Tag.objects.create(name=tag_name, created_by=created_by), True
+
+
def rename_tag(tag: Tag, name: str) -> Tag:
    """Rename *tag*, refusing empty names and case-insensitive duplicates.

    The save is skipped when the normalized name is unchanged.

    Raises:
        ValueError: if the name is blank or already used by another tag.
    """
    new_name = normalize_tag_name(name)
    if not new_name:
        raise ValueError('Tag name is required')

    duplicate = (
        Tag.objects
        .filter(name__iexact=new_name)
        .exclude(pk=tag.pk)
        .first()
    )
    if duplicate is not None:
        raise ValueError(f'Tag "{duplicate.name}" already exists')

    if tag.name != new_name:
        tag.name = new_name
        tag.save()
    return tag
+
+
def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]:
    """Delete *tag*, returning Django's (total_deleted, per-model counts)."""
    deletion_result = tag.delete()
    return deletion_result
+
+
def export_tag_urls(tag: Tag) -> str:
    """Return the tag's snapshot URLs, newest first, one per line."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk')
    return '\n'.join(ordered.values_list('url', flat=True))
+
+
def export_tag_snapshots_jsonl(tag: Tag) -> str:
    """Serialize the tag's snapshots (newest first) as JSONL, one object per line."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').prefetch_related('tags')
    lines = [json.dumps(snap.to_json()) for snap in ordered]
    return '\n'.join(lines)
+
+
+def _display_snapshot_title(snapshot: Snapshot) -> str:
+ title = (snapshot.title or '').strip()
+ url = (snapshot.url or '').strip()
+ if not title:
+ return url
+
+ normalized_title = title.lower()
+ if normalized_title == 'pending...' or normalized_title == url.lower():
+ return url
+ return title
+
+
def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]:
    """Serialize one snapshot into the small preview dict shown on tag cards.

    *request* is passed through to the build_*_url helpers so they can
    derive host-specific URLs (presumably for absolute links -- see
    host_utils for the exact behavior).
    """
    return {
        'id': str(snapshot.pk),
        'title': _display_snapshot_title(snapshot),
        'url': snapshot.url,
        'favicon_url': build_snapshot_url(str(snapshot.pk), 'favicon.ico', request=request),
        'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]),
        'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request),
        # None (not '') when the snapshot has never been downloaded
        'downloaded_at': snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None,
    }
+
+
def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]:
    """Build {tag_id: [snapshot preview dicts]} for a page of tags in one query.

    Each tag gets at most *preview_limit* previews, with its most recently
    downloaded/created snapshots first.
    """
    tag_ids = [tag.pk for tag in tags]
    if not tag_ids:
        return {}

    snapshot_tags = (
        SnapshotTag.objects
        .filter(tag_id__in=tag_ids)
        .select_related('snapshot__crawl__created_by')
        .order_by(
            'tag_id',
            F('snapshot__downloaded_at').desc(nulls_last=True),
            F('snapshot__created_at').desc(nulls_last=True),
            F('snapshot_id').desc(),
        )
    )

    preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list)
    # NOTE(review): rows beyond preview_limit are skipped in Python, so the
    # query still fetches every SnapshotTag row for these tags -- fine for
    # small pages, worth revisiting for tags with very many snapshots.
    for snapshot_tag in snapshot_tags:
        previews = preview_map[snapshot_tag.tag_id]
        if len(previews) >= preview_limit:
            continue
        previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request))
    return preview_map
+
+
def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]:
    """Serialize one Tag into the dict consumed by the tag-card UI/API.

    Prefers the num_snapshots annotation added by get_matching_tags(); only
    falls back to a COUNT query when the annotation is absent.
    """
    # getattr(tag, 'num_snapshots', tag.snapshot_set.count()) would evaluate
    # the fallback eagerly, running a COUNT query per tag even when the
    # annotation is present -- probe for the annotation first instead.
    count = getattr(tag, 'num_snapshots', None)
    if count is None:
        count = tag.snapshot_set.count()
    return {
        'id': tag.pk,
        'name': tag.name,
        'slug': tag.slug,
        'num_snapshots': count,
        'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}",
        'edit_url': reverse('admin:core_tag_change', args=[tag.pk]),
        'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]),
        'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]),
        'rename_url': reverse('api-1:rename_tag', args=[tag.pk]),
        'delete_url': reverse('api-1:delete_tag', args=[tag.pk]),
        'snapshots': snapshot_previews or [],
    }
+
+
def build_tag_cards(
    query: str = '',
    request: HttpRequest | None = None,
    limit: int | None = None,
    preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT,
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> list[dict[str, Any]]:
    """Return fully serialized tag cards for the given search/filter/sort,
    optionally truncated to *limit* tags."""
    matching = get_matching_tags(
        query=query,
        sort=sort,
        created_by=created_by,
        year=year,
        has_snapshots=has_snapshots,
    )
    tags = list(matching if limit is None else matching[:limit])

    previews_by_tag = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit)
    cards = []
    for tag in tags:
        cards.append(build_tag_card(tag, snapshot_previews=previews_by_tag.get(tag.pk, [])))
    return cards
diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py
index 859a4c6f..a0323ca3 100644
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -11,6 +11,7 @@ from archivebox.hooks import (
)
from archivebox.core.host_utils import (
get_admin_base_url,
+ get_public_base_url,
get_web_base_url,
get_snapshot_base_url,
build_snapshot_url,
@@ -166,6 +167,11 @@ def web_base_url(context) -> str:
return get_web_base_url(request=context.get('request'))
@register.simple_tag(takes_context=True)
def public_base_url(context) -> str:
    """Template tag exposing get_public_base_url(), using the current
    request from the template context (None when rendered without one)."""
    return get_public_base_url(request=context.get('request'))
+
+
@register.simple_tag(takes_context=True)
def snapshot_base_url(context, snapshot) -> str:
snapshot_id = getattr(snapshot, 'id', snapshot)
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 5ff0d2fd..d63af6dc 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1,5 +1,6 @@
__package__ = 'archivebox.core'
+import json
import os
import posixpath
from glob import glob, escape
@@ -7,7 +8,7 @@ from django.utils import timezone
import inspect
from typing import Callable, cast, get_type_hints
from pathlib import Path
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
@@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
-from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
@@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
-from archivebox.hooks import get_enabled_plugins, get_plugin_name
+from archivebox.hooks import (
+ BUILTIN_PLUGINS_DIR,
+ USER_PLUGINS_DIR,
+ discover_plugin_configs,
+ get_enabled_plugins,
+ get_plugin_name,
+ iter_plugin_dirs,
+)
+
+
+ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
+LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/'
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
@@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
rel_path = path or ""
show_indexes = bool(request.GET.get("files"))
+ if not show_indexes and (not rel_path or rel_path == "index.html"):
+ return SnapshotView.render_live_index(request, snapshot)
+
if not rel_path or rel_path.endswith("/"):
if show_indexes:
rel_path = rel_path.rstrip("/")
@@ -784,7 +799,6 @@ class SnapshotHostView(View):
raise Http404
return _serve_snapshot_replay(request, snapshot, path)
-
class SnapshotReplayView(View):
"""Serve snapshot directory contents on a one-domain replay path."""
@@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView):
return custom_config
def get_context_data(self, **kwargs):
- from archivebox.core.models import Tag
-
+ required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
+ plugin_configs = discover_plugin_configs()
+ plugin_dependency_map = {
+ plugin_name: [
+ str(required_plugin).strip()
+ for required_plugin in (schema.get('required_plugins') or [])
+ if str(required_plugin).strip()
+ ]
+ for plugin_name, schema in plugin_configs.items()
+ if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins')
+ }
return {
**super().get_context_data(**kwargs),
'title': "Create Crawl",
@@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
+ 'required_search_plugin': required_search_plugin,
+ 'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True),
'stdout': '',
- 'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView):
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
- persona = form.cleaned_data.get("persona", "Default")
- overwrite = form.cleaned_data.get("overwrite", False)
- update = form.cleaned_data.get("update", False)
+ persona = form.cleaned_data.get("persona")
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
+ url_filters = form.cleaned_data.get("url_filters") or {}
custom_config = self._get_custom_config_overrides(form)
from archivebox.config.permissions import HOSTNAME
@@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView):
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
+ sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView):
urls_content = sources_file.read_text()
# Build complete config
config = {
- 'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
- 'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
- 'DEFAULT_PERSONA': persona or 'Default',
+ 'DEFAULT_PERSONA': (persona.name if persona else 'Default'),
}
# Merge custom config overrides
config.update(custom_config)
+ if url_filters.get('allowlist'):
+ config['URL_ALLOWLIST'] = url_filters['allowlist']
+ if url_filters.get('denylist'):
+ config['URL_DENYLIST'] = url_filters['denylist']
crawl = Crawl.objects.create(
urls=urls_content,
@@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView):
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
+ crawl.create_snapshots_from_urls()
+
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from archivebox.crawls.actors import CrawlActor
@@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView):
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
- rough_url_count = urls.count('://')
+ rough_url_count = len([url for url in urls.splitlines() if url.strip()])
# Build success message with schedule link if created
schedule_msg = ""
@@ -1080,10 +1108,6 @@ class WebAddView(AddView):
'persona': defaults_form.fields['persona'].initial or 'Default',
'config': {},
}
- if defaults_form.fields['update'].initial:
- form_data['update'] = 'on'
- if defaults_form.fields['overwrite'].initial:
- form_data['overwrite'] = 'on'
if defaults_form.fields['index_only'].initial:
form_data['index_only'] = 'on'
@@ -1118,6 +1142,41 @@ def live_progress_view(request):
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
    def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
        """Derive (plugin, human label, phase, normalized hook filename) from a hook name.

        Phase is inferred from the on_Crawl__/on_Snapshot__/on_Binary__
        filename prefix; anything else is "unknown".
        """
        normalized_hook_name = Path(hook_name).name if hook_name else ""
        if not normalized_hook_name:
            return (plugin, plugin, "unknown", "")

        phase = "unknown"
        if normalized_hook_name.startswith("on_Crawl__"):
            phase = "crawl"
        elif normalized_hook_name.startswith("on_Snapshot__"):
            phase = "snapshot"
        elif normalized_hook_name.startswith("on_Binary__"):
            phase = "binary"

        # Build a human label: drop the "on_<Model>__" prefix, the file
        # extension, and a leading two-digit "NN_" ordering prefix, then
        # replace underscores with spaces. Empty result falls back to plugin.
        label = normalized_hook_name
        if "__" in normalized_hook_name:
            label = normalized_hook_name.split("__", 1)[1]
        label = label.rsplit(".", 1)[0]
        if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
            label = label[3:]
        label = label.replace("_", " ").strip() or plugin

        return (plugin, label, phase, normalized_hook_name)
+
    def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
        """Resolve a Process.cmd into hook_details() output.

        cmd[0] is expected to be the hook script path; its parent directory
        name is treated as the plugin name.
        """
        hook_path = ""
        if isinstance(cmd, list) and cmd:
            first = cmd[0]
            if isinstance(first, str):
                hook_path = first

        if not hook_path:
            # Malformed or non-hook cmd: attribute it to a generic "setup" entry.
            return ("", "setup", "unknown", "")

        return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")
+
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
@@ -1188,8 +1247,19 @@ def live_progress_view(request):
Process.TypeChoices.BINARY,
],
)
+ recent_processes = Process.objects.filter(
+ machine=machine,
+ process_type__in=[
+ Process.TypeChoices.HOOK,
+ Process.TypeChoices.BINARY,
+ ],
+ modified_at__gte=timezone.now() - timedelta(minutes=10),
+ ).order_by("-modified_at")
crawl_process_pids: dict[str, int] = {}
snapshot_process_pids: dict[str, int] = {}
+ process_records_by_crawl: dict[str, list[dict[str, object]]] = {}
+ process_records_by_snapshot: dict[str, list[dict[str, object]]] = {}
+ seen_process_records: set[str] = set()
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
@@ -1197,11 +1267,48 @@ def live_progress_view(request):
crawl_id = env.get('CRAWL_ID')
snapshot_id = env.get('SNAPSHOT_ID')
+ _plugin, _label, phase, _hook_name = process_label(proc.cmd)
if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
- if snapshot_id and proc.pid:
+ if phase == "snapshot" and snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
+ for proc in recent_processes:
+ env = proc.env or {}
+ if not isinstance(env, dict):
+ env = {}
+
+ crawl_id = env.get("CRAWL_ID")
+ snapshot_id = env.get("SNAPSHOT_ID")
+ if not crawl_id and not snapshot_id:
+ continue
+
+ plugin, label, phase, hook_name = process_label(proc.cmd)
+
+ record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
+ proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
+ if proc_key in seen_process_records:
+ continue
+ seen_process_records.add(proc_key)
+
+ status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
+ payload: dict[str, object] = {
+ "id": str(proc.id),
+ "plugin": plugin,
+ "label": label,
+ "hook_name": hook_name,
+ "status": status,
+ "phase": phase,
+ "source": "process",
+ "process_id": str(proc.id),
+ }
+ if status == "started" and proc.pid:
+ payload["pid"] = proc.pid
+ if phase == "snapshot" and snapshot_id:
+ process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload)
+ elif crawl_id:
+ process_records_by_crawl.setdefault(str(crawl_id), []).append(payload)
+
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
@@ -1234,6 +1341,11 @@ def live_progress_view(request):
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
+ crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), []))
+ crawl_setup_total = len(crawl_setup_plugins)
+ crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
+ crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
+ crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
@@ -1241,28 +1353,21 @@ def live_progress_view(request):
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
- # Count in memory instead of DB queries
- total_plugins = len(snapshot_results)
- completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
- failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
- pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
-
- # Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
+ all_plugins: list[dict[str, object]] = []
+ seen_plugin_keys: set[str] = set()
- # Get all extractor plugins for this snapshot (already prefetched, sort in Python)
- # Order: started first, then queued, then completed
def plugin_sort_key(ar):
status_order = {
ArchiveResult.StatusChoices.STARTED: 0,
ArchiveResult.StatusChoices.QUEUED: 1,
ArchiveResult.StatusChoices.SUCCEEDED: 2,
- ArchiveResult.StatusChoices.FAILED: 3,
+ ArchiveResult.StatusChoices.NORESULTS: 3,
+ ArchiveResult.StatusChoices.FAILED: 4,
}
- return (status_order.get(ar.status, 4), ar.plugin)
+ return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")
- all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
@@ -1270,6 +1375,7 @@ def live_progress_view(request):
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
+ ArchiveResult.StatusChoices.NORESULTS,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
@@ -1284,20 +1390,49 @@ def live_progress_view(request):
progress_value = 0
plugin_progress_values.append(progress_value)
+ plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
+ 'label': label,
+ 'hook_name': hook_name,
+ 'phase': phase,
'status': status,
+ 'process_id': str(ar.process_id) if ar.process_id else None,
}
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
+ plugin_payload['source'] = 'archiveresult'
all_plugins.append(plugin_payload)
+ seen_plugin_keys.add(
+ str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}"
+ )
- snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
+ for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []):
+ proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
+ if proc_key in seen_plugin_keys:
+ continue
+ seen_plugin_keys.add(proc_key)
+ all_plugins.append(proc_payload)
+
+ proc_status = proc_payload.get("status")
+ if proc_status in ("succeeded", "failed", "skipped"):
+ plugin_progress_values.append(100)
+ elif proc_status == "started":
+ plugin_progress_values.append(1)
+ else:
+ plugin_progress_values.append(0)
+
+ total_plugins = len(all_plugins)
+ completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
+ failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
+ pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")
+
+ snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -1334,6 +1469,11 @@ def live_progress_view(request):
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
+ 'setup_plugins': crawl_setup_plugins,
+ 'setup_total_plugins': crawl_setup_total,
+ 'setup_completed_plugins': crawl_setup_completed,
+ 'setup_failed_plugins': crawl_setup_failed,
+ 'setup_pending_plugins': crawl_setup_pending,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'urls_preview': urls_preview,
@@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
from archivebox.machine.models import Machine
- # Check if it's from archivebox.machine.config
+ # Environment variables override all persistent config sources.
+ if key in os.environ:
+ return 'Environment'
+
+ # Machine.config overrides ArchiveBox.conf.
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str:
except Exception:
pass
- # Check if it's from environment variable
- if key in os.environ:
- return 'Environment'
-
# Check if it's from archivebox.config.file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
@@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str:
return 'Default'
def find_plugin_for_config_key(key: str) -> str | None:
    """Return the name of the first plugin whose config schema declares *key*, or None."""
    for plugin_name, schema in discover_plugin_configs().items():
        if key in (schema.get('properties') or {}):
            return plugin_name
    return None
+
+
def get_config_definition_link(key: str) -> tuple[str, str]:
    """Return (url, display_label) pointing at where config *key* is defined.

    Keys not declared by any plugin fall back to a GitHub code search over
    archivebox/config.
    """
    plugin_name = find_plugin_for_config_key(key)
    if not plugin_name:
        return (
            f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code',
            'archivebox/config',
        )

    plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
    if plugin_dir:
        # Built-in plugins link to their config.json in the abx-plugins repo.
        builtin_root = BUILTIN_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(builtin_root):
            return (
                f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json',
                f'abx_plugins/plugins/{plugin_name}/config.json',
            )

        # User plugins link to the live plugin page in the local admin.
        user_root = USER_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(user_root):
            return (
                f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/',
                f'data/custom_plugins/{plugin_name}/config.json',
            )

    # Plugin is known but its dir is missing or outside both roots: fall back
    # to the builtin live-plugin admin page.
    return (
        f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/',
        f'abx_plugins/plugins/{plugin_name}/config.json',
    )
+
+
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
@@ -1566,17 +1743,6 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
# Determine all sources for this config value
sources_info = []
- # Default value
- default_val = find_config_default(key)
- if default_val:
- sources_info.append(('Default', default_val, 'gray'))
-
- # Config file value
- if CONSTANTS.CONFIG_FILE.exists():
- file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
- if key in file_config:
- sources_info.append(('Config File', file_config[key], 'green'))
-
# Environment variable
if key in os.environ:
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
@@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
except Exception:
pass
+ # Config file value
+ if CONSTANTS.CONFIG_FILE.exists():
+ file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
+ if key in file_config:
+ sources_info.append(('Config File', file_config[key], 'green'))
+
+ # Default value
+ default_val = find_config_default(key)
+ if default_val:
+ sources_info.append(('Default', default_val, 'gray'))
+
# Final computed value
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
if not key_is_safe(key):
@@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
section_header = mark_safe(f'[DYNAMIC CONFIG] {key} (read-only, calculated at runtime) ')
+ definition_url, definition_label = get_config_definition_link(key)
+
section_data = cast(SectionData, {
"name": section_header,
"description": None,
@@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
- 'Source': find_config_source(key, merged_config),
+ 'Currently read from': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
@@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'''),
'Type': mark_safe(f'''
-
- See full definition in archivebox/config...
+
+ See full definition in {definition_label}...
'''),
'Value': mark_safe(f'''
{'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI) ' if not key_is_safe(key) else ''}
- Configuration Sources (in priority order):
+ Configuration Sources (highest priority first):
{sources_html}
@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
}"
'''),
- 'Source': mark_safe(f'''
+ 'Currently read from': mark_safe(f'''
The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source.
Priority order (highest to lowest):
+ Environment - Environment variables
Machine - Machine-specific overrides (e.g., resolved binary paths)
{f'→ Edit {key} in Machine.config for this server ' if machine_admin_url else ''}
- Environment - Environment variables
Config File - data/ArchiveBox.conf
Default - Default value from code
diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py
index 6e9fe475..af30544c 100644
--- a/archivebox/core/widgets.py
+++ b/archivebox/core/widgets.py
@@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget):
}};
window.updateHiddenInput_{widget_id} = function() {{
- document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
+ var hiddenInput = document.getElementById('{widget_id}');
+ if (!hiddenInput) {{
+ return;
+ }}
+ hiddenInput.value = currentTags_{widget_id}.join(',');
+ hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
+ hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
+ }};
+
+ function normalizeTags_{widget_id}(value) {{
+ var rawTags = Array.isArray(value) ? value : String(value || '').split(',');
+ var seen = {{}};
+ return rawTags
+ .map(function(tag) {{ return String(tag || '').trim(); }})
+ .filter(function(tag) {{
+ if (!tag) return false;
+ var normalized = tag.toLowerCase();
+ if (seen[normalized]) return false;
+ seen[normalized] = true;
+ return true;
+ }})
+ .sort(function(a, b) {{
+ return a.toLowerCase().localeCompare(b.toLowerCase());
+ }});
+ }}
+
+ window.setTags_{widget_id} = function(value, options) {{
+ currentTags_{widget_id} = normalizeTags_{widget_id}(value);
+ rebuildPills_{widget_id}();
+ if (!(options && options.skipHiddenUpdate)) {{
+ updateHiddenInput_{widget_id}();
+ }}
+ }};
+
+ window.syncTagEditorFromHidden_{widget_id} = function() {{
+ var hiddenInput = document.getElementById('{widget_id}');
+ if (!hiddenInput) {{
+ return;
+ }}
+ setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }});
}};
function computeTagStyle_{widget_id}(tagName) {{
@@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget):
// Add to current tags
currentTags_{widget_id}.push(tagName);
- currentTags_{widget_id}.sort(function(a, b) {{
- return a.toLowerCase().localeCompare(b.toLowerCase());
- }});
+ currentTags_{widget_id} = normalizeTags_{widget_id}(currentTags_{widget_id});
// Rebuild pills
rebuildPills_{widget_id}();
@@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget):
}}
}});
+ document.getElementById('{widget_id}').addEventListener('change', function() {{
+ syncTagEditorFromHidden_{widget_id}();
+ }});
+
+ document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{
+ syncTagEditorFromHidden_{widget_id}();
+ }});
+
window.handleTagKeydown_{widget_id} = function(event) {{
var input = event.target;
var value = input.value.trim();
@@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget):
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
return input ? input.value : '';
}}
+
+ syncTagEditorFromHidden_{widget_id}();
}})();
'''
@@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget):
return mark_safe(html)
+class URLFiltersWidget(forms.Widget):
+ """Render URL allowlist / denylist controls with same-domain autofill."""
+
+ template_name = ""
+
+ def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'):
+ self.source_selector = source_selector
+ super().__init__(attrs)
+
+ def render(self, name, value, attrs=None, renderer=None):
+ value = value if isinstance(value, dict) else {}
+ widget_id_raw = attrs.get('id', name) if attrs else name
+ widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name
+ allowlist = escape(value.get('allowlist', '') or '')
+ denylist = escape(value.get('denylist', '') or '')
+
+ return mark_safe(f'''
+
+ ''')
+
+ def value_from_datadict(self, data, files, name):
+ return {
+ 'allowlist': data.get(f'{name}_allowlist', ''),
+ 'denylist': data.get(f'{name}_denylist', ''),
+ 'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
+ }
+
+
class InlineTagEditorWidget(TagEditorWidget):
"""
Inline version of TagEditorWidget for use in list views.
Includes AJAX save functionality for immediate persistence.
"""
- def __init__(self, attrs=None, snapshot_id=None):
+ def __init__(self, attrs=None, snapshot_id=None, editable=True):
super().__init__(attrs, snapshot_id)
self.snapshot_id = snapshot_id
+ self.editable = editable
def render(self, name, value, attrs=None, renderer=None, snapshot_id=None):
"""Render inline tag editor with AJAX save."""
@@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget):
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
+ remove_button = ''
+ if self.editable:
+ remove_button = (
+ f'× '
+ )
pills_html += f'''
{self._escape(td['name'])}
- ×
+ {remove_button}
'''
tags_json = escape(json.dumps(tag_data))
-
- html = f'''
-
-
- {pills_html}
-
+ input_html = ''
+ readonly_class = ' readonly' if not self.editable else ''
+ if self.editable:
+ input_html = f'''
+ '''
+
+ html = f'''
+
+
+ {pills_html}
+
+ {input_html}
'''
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 2e637ff0..4c83e97b 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -1,8 +1,11 @@
__package__ = 'archivebox.crawls'
-
from django import forms
-from django.utils.html import format_html, format_html_join
+from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed
+from django.shortcuts import get_object_or_404, redirect
+from django.urls import path, reverse
+from django.utils.html import escape, format_html, format_html_join
+from django.utils import timezone
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q
@@ -13,16 +16,19 @@ from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.core.models import Snapshot
+from archivebox.core.widgets import TagEditorWidget
from archivebox.crawls.models import Crawl, CrawlSchedule
-def render_snapshots_list(snapshots_qs, limit=20):
+def render_snapshots_list(snapshots_qs, limit=20, crawl=None):
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""
snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
total_results=Count('archiveresult'),
succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
+ started_results=Count('archiveresult', filter=Q(archiveresult__status='started')),
+ skipped_results=Count('archiveresult', filter=Q(archiveresult__status='skipped')),
)
if not snapshots:
@@ -43,17 +49,57 @@ def render_snapshots_list(snapshots_qs, limit=20):
# Calculate progress
total = snapshot.total_results
- done = snapshot.succeeded_results + snapshot.failed_results
+ succeeded = snapshot.succeeded_results
+ failed = snapshot.failed_results
+ running = snapshot.started_results
+ skipped = snapshot.skipped_results
+ done = succeeded + failed + skipped
+ pending = max(total - done - running, 0)
progress_pct = int((done / total) * 100) if total > 0 else 0
progress_text = f'{done}/{total}' if total > 0 else '-'
+ progress_title = (
+ f'{succeeded} succeeded, {failed} failed, {running} running, '
+ f'{pending} pending, {skipped} skipped'
+ )
+ progress_color = '#28a745'
+ if failed:
+ progress_color = '#dc3545'
+ elif running:
+ progress_color = '#17a2b8'
+ elif pending:
+ progress_color = '#ffc107'
# Truncate title and URL
- title = (snapshot.title or 'Untitled')[:60]
- if len(snapshot.title or '') > 60:
+ snapshot_title = snapshot.title or 'Untitled'
+ title = snapshot_title[:60]
+ if len(snapshot_title) > 60:
title += '...'
url_display = snapshot.url[:50]
if len(snapshot.url) > 50:
url_display += '...'
+ delete_button = ''
+ exclude_button = ''
+ if crawl is not None:
+ delete_url = reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk])
+ exclude_url = reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, snapshot.pk])
+ delete_button = f'''
+ 🗑
+ '''
+ exclude_button = f'''
+ ⊘
+ '''
# Format date
date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
@@ -74,18 +120,18 @@ def render_snapshots_list(snapshots_qs, limit=20):
{title}
+ title="{escape(snapshot_title)}">{escape(title)}
- {url_display}
+ title="{escape(snapshot.url)}">{escape(url_display)}
-
+            {"<td>%s%s</td>" % (exclude_button, delete_button) if crawl is not None else ""}
''')
@@ -111,7 +158,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
'''
return mark_safe(f'''
-
+
@@ -121,6 +168,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
URL
Progress
Created
+                {'<th>Actions</th>' if crawl is not None else ''}
@@ -129,11 +177,197 @@ def render_snapshots_list(snapshots_qs, limit=20):
+ {'''
+
+ ''' if crawl is not None else ''}
''')
+class URLFiltersWidget(forms.Widget):
+ def render(self, name, value, attrs=None, renderer=None):
+ value = value if isinstance(value, dict) else {}
+ widget_id = (attrs or {}).get('id', name)
+ allowlist = escape(value.get('allowlist', '') or '')
+ denylist = escape(value.get('denylist', '') or '')
+
+ return mark_safe(f'''
+
+ ''')
+
+ def value_from_datadict(self, data, files, name):
+ return {
+ 'allowlist': data.get(f'{name}_allowlist', ''),
+ 'denylist': data.get(f'{name}_denylist', ''),
+ 'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
+ }
+
+
+class URLFiltersField(forms.Field):
+ widget = URLFiltersWidget
+
+ def to_python(self, value):
+ if isinstance(value, dict):
+ return value
+ return {'allowlist': '', 'denylist': '', 'same_domain_only': False}
+
+
class CrawlAdminForm(forms.ModelForm):
"""Custom form for Crawl admin to render urls field as textarea."""
+ tags_editor = forms.CharField(
+ label='Tags',
+ required=False,
+ widget=TagEditorWidget(),
+ help_text='Type tag names and press Enter or Space to add. Click × to remove.',
+ )
+ url_filters = URLFiltersField(
+ label='URL Filters',
+ required=False,
+ help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.',
+ )
class Meta:
model = Crawl
@@ -144,8 +378,62 @@ class CrawlAdminForm(forms.ModelForm):
'style': 'width: 100%; font-family: monospace; font-size: 13px;',
'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
}),
+ 'notes': forms.Textarea(attrs={
+ 'rows': 1,
+ 'style': 'width: 100%; min-height: 0; resize: vertical;',
+ }),
}
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {}
+ if self.instance and self.instance.pk:
+ self.initial['tags_editor'] = self.instance.tags_str
+ self.initial['url_filters'] = {
+ 'allowlist': config.get('URL_ALLOWLIST', ''),
+ 'denylist': config.get('URL_DENYLIST', ''),
+ 'same_domain_only': False,
+ }
+
+ def clean_tags_editor(self):
+ tags_str = self.cleaned_data.get('tags_editor', '')
+ tag_names = []
+ seen = set()
+ for raw_name in tags_str.split(','):
+ name = raw_name.strip()
+ if not name:
+ continue
+ lowered = name.lower()
+ if lowered in seen:
+ continue
+ seen.add(lowered)
+ tag_names.append(name)
+ return ','.join(tag_names)
+
+ def clean_url_filters(self):
+ value = self.cleaned_data.get('url_filters') or {}
+ return {
+ 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
+ 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
+ 'same_domain_only': bool(value.get('same_domain_only')),
+ }
+
+ def save(self, commit=True):
+ instance = super().save(commit=False)
+ instance.tags_str = self.cleaned_data.get('tags_editor', '')
+ url_filters = self.cleaned_data.get('url_filters') or {}
+ instance.set_url_filters(
+ url_filters.get('allowlist', ''),
+ url_filters.get('denylist', ''),
+ )
+ if commit:
+ instance.save()
+ instance.apply_crawl_config_filters()
+ save_m2m = getattr(self, '_save_m2m', None)
+ if callable(save_m2m):
+ save_m2m()
+ return instance
+
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
@@ -161,11 +449,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Info', {
- 'fields': ('label', 'notes', 'tags_str'),
+ 'fields': ('label', 'notes', 'tags_editor'),
'classes': ('card',),
}),
('Settings', {
- 'fields': ('max_depth', 'config'),
+ 'fields': (('max_depth', 'url_filters'), 'config'),
'classes': ('card',),
}),
('Status', {
@@ -185,6 +473,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
)
+ add_fieldsets = (
+ ('URLs', {
+ 'fields': ('urls',),
+ 'classes': ('card', 'wide'),
+ }),
+ ('Info', {
+ 'fields': ('label', 'notes', 'tags_editor'),
+ 'classes': ('card',),
+ }),
+ ('Settings', {
+ 'fields': (('max_depth', 'url_filters'), 'config'),
+ 'classes': ('card',),
+ }),
+ ('Status', {
+ 'fields': ('status', 'retry_at'),
+ 'classes': ('card',),
+ }),
+ ('Relations', {
+ 'fields': ('schedule', 'created_by'),
+ 'classes': ('card',),
+ }),
+ )
list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -199,6 +509,25 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
num_snapshots_cached=Count('snapshot_set')
)
+ def get_fieldsets(self, request, obj=None):
+ return self.fieldsets if obj else self.add_fieldsets
+
+ def get_urls(self):
+ urls = super().get_urls()
+ custom_urls = [
+ path(
+ '
/snapshot//delete/',
+ self.admin_site.admin_view(self.delete_snapshot_view),
+ name='crawls_crawl_snapshot_delete',
+ ),
+ path(
+                '<path:object_id>/snapshot/<path:snapshot_id>/exclude-domain/',
+ self.admin_site.admin_view(self.exclude_domain_view),
+ name='crawls_crawl_snapshot_exclude_domain',
+ ),
+ ]
+ return custom_urls + urls
+
@admin.action(description='Delete selected crawls')
def delete_selected_batched(self, request, queryset):
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
@@ -218,8 +547,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
- from django.utils import timezone
- from django.shortcuts import redirect
# Validate URLs (required for crawl to start)
if not obj.urls:
@@ -252,7 +579,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
def snapshots(self, obj):
- return render_snapshots_list(obj.snapshot_set.all())
+ return render_snapshots_list(obj.snapshot_set.all(), crawl=obj)
+
+ def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
+ if request.method != 'POST':
+ return HttpResponseNotAllowed(['POST'])
+
+ crawl = get_object_or_404(Crawl, pk=object_id)
+ snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
+
+ if snapshot.status == Snapshot.StatusChoices.STARTED:
+ snapshot.cancel_running_hooks()
+
+ removed_urls = crawl.prune_url(snapshot.url)
+ snapshot.delete()
+ return JsonResponse({
+ 'ok': True,
+ 'snapshot_id': str(snapshot.id),
+ 'removed_urls': removed_urls,
+ })
+
+ def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
+ if request.method != 'POST':
+ return HttpResponseNotAllowed(['POST'])
+
+ crawl = get_object_or_404(Crawl, pk=object_id)
+ snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
+ result = crawl.exclude_domain(snapshot.url)
+ return JsonResponse({
+ 'ok': True,
+ **result,
+ })
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index afdb928f..77023c55 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -2,9 +2,12 @@ __package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
import uuid
+import json
+import re
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
+from urllib.parse import urlparse
from django.db import models
from django.core.validators import MaxValueValidator, MinValueValidator
@@ -141,22 +144,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return f'[...{short_id}] {first_url[:120]}'
def save(self, *args, **kwargs):
- is_new = self._state.adding
super().save(*args, **kwargs)
- if is_new:
- from archivebox.misc.logging_util import log_worker_event
- first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
- log_worker_event(
- worker_type='DB',
- event='Created Crawl',
- indent_level=1,
- metadata={
- 'id': str(self.id),
- 'first_url': first_url[:64],
- 'max_depth': self.max_depth,
- 'status': self.status,
- },
- )
+ # if is_new:
+ # from archivebox.misc.logging_util import log_worker_event
+ # first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+ # log_worker_event(
+ # worker_type='DB',
+ # event='Created Crawl',
+ # indent_level=1,
+ # metadata={
+ # 'id': str(self.id),
+ # 'first_url': first_url[:64],
+ # 'max_depth': self.max_depth,
+ # 'status': self.status,
+ # },
+ # )
@property
def api_url(self) -> str:
@@ -248,6 +250,222 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
+ @staticmethod
+ def normalize_domain(value: str) -> str:
+ candidate = (value or '').strip().lower()
+ if not candidate:
+ return ''
+ if '://' not in candidate and '/' not in candidate:
+ candidate = f'https://{candidate.lstrip(".")}'
+ try:
+ parsed = urlparse(candidate)
+ hostname = parsed.hostname or ''
+ if not hostname:
+ return ''
+ if parsed.port:
+ return f'{hostname}_{parsed.port}'
+ return hostname
+ except Exception:
+ return ''
+
+ @staticmethod
+ def split_filter_patterns(value) -> list[str]:
+ patterns = []
+ seen = set()
+ if isinstance(value, list):
+ raw_values = value
+ elif isinstance(value, str):
+ raw_values = value.splitlines()
+ else:
+ raw_values = []
+
+ for raw_value in raw_values:
+ pattern = str(raw_value or '').strip()
+ if not pattern or pattern in seen:
+ continue
+ seen.add(pattern)
+ patterns.append(pattern)
+ return patterns
+
+ @classmethod
+ def _pattern_matches_url(cls, url: str, pattern: str) -> bool:
+ normalized_pattern = str(pattern or '').strip()
+ if not normalized_pattern:
+ return False
+
+ if re.fullmatch(r'[\w.*:-]+', normalized_pattern):
+ wildcard_only_subdomains = normalized_pattern.startswith('*.')
+ normalized_domain = cls.normalize_domain(
+ normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern
+ )
+ normalized_url_domain = cls.normalize_domain(url)
+ if not normalized_domain or not normalized_url_domain:
+ return False
+
+ pattern_host = normalized_domain.split('_', 1)[0]
+ url_host = normalized_url_domain.split('_', 1)[0]
+
+ if wildcard_only_subdomains:
+ return url_host.endswith(f'.{pattern_host}')
+
+ if normalized_url_domain == normalized_domain:
+ return True
+ return url_host == pattern_host or url_host.endswith(f'.{pattern_host}')
+
+ try:
+ return bool(re.search(normalized_pattern, url))
+ except re.error:
+ return False
+
+ def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
+ if use_effective_config:
+ from archivebox.config.configset import get_config
+
+ config = get_config(crawl=self, snapshot=snapshot)
+ else:
+ config = self.config or {}
+ return self.split_filter_patterns(config.get('URL_ALLOWLIST', ''))
+
+ def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
+ if use_effective_config:
+ from archivebox.config.configset import get_config
+
+ config = get_config(crawl=self, snapshot=snapshot)
+ else:
+ config = self.config or {}
+ return self.split_filter_patterns(config.get('URL_DENYLIST', ''))
+
+ def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool:
+ denylist = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot)
+ allowlist = self.get_url_allowlist(use_effective_config=use_effective_config, snapshot=snapshot)
+
+ for pattern in denylist:
+ if self._pattern_matches_url(url, pattern):
+ return False
+
+ if allowlist:
+ return any(self._pattern_matches_url(url, pattern) for pattern in allowlist)
+
+ return True
+
+ def set_url_filters(self, allowlist, denylist) -> None:
+ config = dict(self.config or {})
+ allow_patterns = self.split_filter_patterns(allowlist)
+ deny_patterns = self.split_filter_patterns(denylist)
+
+ if allow_patterns:
+ config['URL_ALLOWLIST'] = '\n'.join(allow_patterns)
+ else:
+ config.pop('URL_ALLOWLIST', None)
+
+ if deny_patterns:
+ config['URL_DENYLIST'] = '\n'.join(deny_patterns)
+ else:
+ config.pop('URL_DENYLIST', None)
+
+ self.config = config
+
+ def apply_crawl_config_filters(self) -> dict[str, int]:
+ from archivebox.core.models import Snapshot
+
+ removed_urls = self.prune_urls(
+ lambda url: not self.url_passes_filters(url, use_effective_config=False)
+ )
+
+ filtered_snapshots = [
+ snapshot
+ for snapshot in self.snapshot_set.filter(
+ status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
+ ).only('pk', 'url', 'status')
+ if not self.url_passes_filters(snapshot.url, snapshot=snapshot, use_effective_config=False)
+ ]
+
+ deleted_snapshots = 0
+ if filtered_snapshots:
+ started_snapshots = [
+ snapshot for snapshot in filtered_snapshots
+ if snapshot.status == Snapshot.StatusChoices.STARTED
+ ]
+ for snapshot in started_snapshots:
+ snapshot.cancel_running_hooks()
+
+ filtered_snapshot_ids = [snapshot.pk for snapshot in filtered_snapshots]
+ deleted_snapshots, _ = self.snapshot_set.filter(pk__in=filtered_snapshot_ids).delete()
+
+ return {
+ 'removed_urls': len(removed_urls),
+ 'deleted_snapshots': deleted_snapshots,
+ }
+
+ def _iter_url_lines(self) -> list[tuple[str, str]]:
+ entries: list[tuple[str, str]] = []
+ for raw_line in (self.urls or '').splitlines():
+ stripped = raw_line.strip()
+ if not stripped:
+ continue
+ if stripped.startswith('#'):
+ entries.append((raw_line.rstrip(), ''))
+ continue
+ try:
+ entry = json.loads(stripped)
+ entries.append((raw_line.rstrip(), str(entry.get('url', '') or '').strip()))
+ except json.JSONDecodeError:
+ entries.append((raw_line.rstrip(), stripped))
+ return entries
+
+ def prune_urls(self, predicate) -> list[str]:
+ kept_lines: list[str] = []
+ removed_urls: list[str] = []
+
+ for raw_line, url in self._iter_url_lines():
+ if not url:
+ kept_lines.append(raw_line)
+ continue
+ if predicate(url):
+ removed_urls.append(url)
+ continue
+ kept_lines.append(raw_line)
+
+ next_urls = '\n'.join(kept_lines)
+ if next_urls != (self.urls or ''):
+ self.urls = next_urls
+ self.save(update_fields=['urls', 'modified_at'])
+ return removed_urls
+
+ def prune_url(self, url: str) -> int:
+ target = (url or '').strip()
+ removed = self.prune_urls(lambda candidate: candidate == target)
+ return len(removed)
+
+ def exclude_domain(self, domain: str) -> dict[str, int | str | bool]:
+ normalized_domain = self.normalize_domain(domain)
+ if not normalized_domain:
+ return {
+ 'domain': '',
+ 'created': False,
+ 'removed_urls': 0,
+ 'deleted_snapshots': 0,
+ }
+
+ domains = self.get_url_denylist(use_effective_config=False)
+ created = normalized_domain not in domains
+ if created:
+ domains.append(normalized_domain)
+ self.set_url_filters(
+ self.get_url_allowlist(use_effective_config=False),
+ domains,
+ )
+ self.save(update_fields=['config', 'modified_at'])
+
+ filter_result = self.apply_crawl_config_filters()
+
+ return {
+ 'domain': normalized_domain,
+ 'created': created,
+ 'removed_urls': filter_result['removed_urls'],
+ 'deleted_snapshots': filter_result['deleted_snapshots'],
+ }
+
def get_system_task(self) -> str | None:
urls = self.get_urls_list()
if len(urls) != 1:
@@ -284,11 +502,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
True if URL was added, False if skipped (duplicate or depth exceeded)
"""
- import json
+ from archivebox.misc.util import fix_url_from_markdown
- url = entry.get('url', '')
+ url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
if not url:
return False
+ if not self.url_passes_filters(url):
+ return False
depth = entry.get('depth', 1)
@@ -301,20 +521,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return False
# Check if already in urls (parse existing JSONL entries)
- existing_urls = set()
- for line in self.urls.splitlines():
- if not line.strip():
- continue
- try:
- existing_entry = json.loads(line)
- existing_urls.add(existing_entry.get('url', ''))
- except json.JSONDecodeError:
- existing_urls.add(line.strip())
+ existing_urls = {url for _raw_line, url in self._iter_url_lines() if url}
if url in existing_urls:
return False
# Append as JSONL
+ entry = {**entry, 'url': url}
jsonl_entry = json.dumps(entry)
self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
self.save(update_fields=['urls', 'modified_at'])
@@ -327,15 +540,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
List of newly created Snapshot objects
"""
- import sys
- import json
from archivebox.core.models import Snapshot
+ from archivebox.misc.util import fix_url_from_markdown
created_snapshots = []
- print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
- print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
-
for line in self.urls.splitlines():
if not line.strip():
continue
@@ -343,13 +552,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Parse JSONL or plain URL
try:
entry = json.loads(line)
- url = entry.get('url', '')
+ url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
depth = entry.get('depth', 0)
title = entry.get('title')
timestamp = entry.get('timestamp')
tags = entry.get('tags', '')
except json.JSONDecodeError:
- url = line.strip()
+ url = fix_url_from_markdown(line.strip())
depth = 0
title = None
timestamp = None
@@ -357,6 +566,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not url:
continue
+ if not self.url_passes_filters(url):
+ continue
# Skip if depth exceeds max_depth
if depth > self.max_depth:
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 586f88fb..3dda2bd6 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -64,6 +64,7 @@ from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
+from archivebox.misc.util import fix_url_from_markdown
if TYPE_CHECKING:
from archivebox.machine.models import Process
@@ -266,7 +267,7 @@ def run_hook(
if process.status == 'exited':
records = process.get_records() # Get parsed JSONL output
"""
- from archivebox.machine.models import Process, Machine
+ from archivebox.machine.models import Process, Machine, NetworkInterface
from archivebox.config.constants import CONSTANTS
import sys
@@ -280,6 +281,8 @@ def run_hook(
# Get current machine
machine = Machine.current()
+ iface = NetworkInterface.current(refresh=True)
+ machine = iface.machine
# Auto-detect parent process if not explicitly provided
# This enables automatic hierarchy tracking: Worker -> Hook
@@ -294,6 +297,7 @@ def run_hook(
# Create a failed Process record for hooks that don't exist
process = Process.objects.create(
machine=machine,
+ iface=iface,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
@@ -449,6 +453,7 @@ def run_hook(
# Create Process record
process = Process.objects.create(
machine=machine,
+ iface=iface,
parent=parent,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
@@ -458,6 +463,7 @@ def run_hook(
# Copy the env dict we already built (includes os.environ + all customizations)
process.env = env.copy()
+ process.hydrate_binary_from_context(plugin_name=script.parent.name, hook_path=str(script))
# Save env before launching
process.save()
@@ -472,6 +478,7 @@ def run_hook(
# Create a failed Process record for exceptions
process = Process.objects.create(
machine=machine,
+ iface=iface,
process_type=Process.TypeChoices.HOOK,
pwd=str(output_dir),
cmd=cmd,
@@ -544,6 +551,9 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
text = urls_file.read_text()
for entry in Process.parse_records_from_text(text):
if entry.get('url'):
+ entry['url'] = fix_url_from_markdown(str(entry['url']).strip())
+ if not entry['url']:
+ continue
# Track which parser plugin found this URL
entry['plugin'] = subdir.name
urls.append(entry)
@@ -615,11 +625,30 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
from archivebox.config.configset import get_config
config = get_config()
+ def normalize_enabled_plugins(value: Any) -> List[str]:
+ if value is None:
+ return []
+ if isinstance(value, str):
+ raw = value.strip()
+ if not raw:
+ return []
+ if raw.startswith('['):
+ try:
+ parsed = json.loads(raw)
+ except json.JSONDecodeError:
+ parsed = None
+ if isinstance(parsed, list):
+ return [str(plugin).strip() for plugin in parsed if str(plugin).strip()]
+ return [plugin.strip() for plugin in raw.split(',') if plugin.strip()]
+ if isinstance(value, (list, tuple, set)):
+ return [str(plugin).strip() for plugin in value if str(plugin).strip()]
+ return [str(value).strip()] if str(value).strip() else []
+
# Support explicit ENABLED_PLUGINS override (legacy)
if 'ENABLED_PLUGINS' in config:
- return config['ENABLED_PLUGINS']
+ return normalize_enabled_plugins(config['ENABLED_PLUGINS'])
if 'ENABLED_EXTRACTORS' in config:
- return config['ENABLED_EXTRACTORS']
+ return normalize_enabled_plugins(config['ENABLED_EXTRACTORS'])
# Filter all plugins by enabled status
all_plugins = get_plugins()
@@ -1042,6 +1071,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
+ if record.get('url'):
+ record = {
+ **record,
+ 'url': fix_url_from_markdown(str(record['url']).strip()),
+ }
+ if not record['url']:
+ continue
+
# Check if discovered snapshot exceeds crawl max_depth
snapshot_depth = record.get('depth', 0)
crawl = overrides.get('crawl')
diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py
index 27bdf060..7d531aed 100644
--- a/archivebox/machine/admin.py
+++ b/archivebox/machine/admin.py
@@ -113,7 +113,7 @@ class BinaryAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
- readonly_fields = ('created_at', 'modified_at')
+ readonly_fields = ('created_at', 'modified_at', 'output_dir')
fieldsets = (
('Binary Info', {
@@ -166,7 +166,7 @@ class ProcessAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
- readonly_fields = ('created_at', 'modified_at', 'machine', 'binary', 'iface', 'archiveresult_link')
+ readonly_fields = ('created_at', 'modified_at', 'machine', 'binary_link', 'iface_link', 'archiveresult_link')
fieldsets = (
('Process Info', {
@@ -178,7 +178,7 @@ class ProcessAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Execution', {
- 'fields': ('binary', 'iface', 'pid', 'exit_code', 'url'),
+ 'fields': ('binary_link', 'iface_link', 'pid', 'exit_code', 'url'),
'classes': ('card',),
}),
('Timing', {
@@ -216,6 +216,21 @@ class ProcessAdmin(BaseModelAdmin):
process.binary.id, process.binary.name, process.binary.version,
)
+ @admin.display(description='Binary', ordering='binary__name')
+ def binary_link(self, process):
+ return self.binary_info(process)
+
+ @admin.display(description='Network Interface', ordering='iface__id')
+ def iface_link(self, process):
+ if not process.iface:
+ return '-'
+ return format_html(
+ '{} {} ',
+ process.iface.id,
+ str(process.iface.id)[:8],
+ process.iface.iface or process.iface.ip_public or process.iface.ip_local,
+ )
+
@admin.display(description='ArchiveResult')
def archiveresult_link(self, process):
if not hasattr(process, 'archiveresult'):
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index fd700f91..441b8cf1 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -49,6 +49,89 @@ BINARY_RECHECK_INTERVAL = 1 * 30 * 60
PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds
PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid
START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching
+LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"})
+
+
+def _find_existing_binary_for_reference(machine: 'Machine', reference: str) -> 'Binary | None':
+ reference = str(reference or '').strip()
+ if not reference:
+ return None
+
+ qs = Binary.objects.filter(machine=machine)
+
+ direct_match = qs.filter(abspath=reference).order_by('-modified_at').first()
+ if direct_match:
+ return direct_match
+
+ ref_name = Path(reference).name
+ if ref_name:
+ named_match = qs.filter(name=ref_name).order_by('-modified_at').first()
+ if named_match:
+ return named_match
+
+ return qs.filter(name=reference).order_by('-modified_at').first()
+
+
+def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]:
+ env = env or {}
+ plugin_name = str(plugin_name or '').strip()
+ hook_path = str(hook_path or '').strip()
+ plugin_key = plugin_name.upper().replace('-', '_')
+ keys: list[str] = []
+ seen: set[str] = set()
+
+ def add(key: str) -> None:
+ if key and key not in seen and env.get(key):
+ seen.add(key)
+ keys.append(key)
+
+ if plugin_key:
+ add(f'{plugin_key}_BINARY')
+
+ try:
+ from archivebox.hooks import discover_plugin_configs
+
+ plugin_schema = discover_plugin_configs().get(plugin_name, {})
+ schema_keys = [
+ key
+ for key in (plugin_schema.get('properties') or {})
+ if key.endswith('_BINARY')
+ ]
+ except Exception:
+ schema_keys = []
+
+ schema_keys.sort(key=lambda key: (
+ key != f'{plugin_key}_BINARY',
+ key.endswith('_NODE_BINARY'),
+ key.endswith('_CHROME_BINARY'),
+ key,
+ ))
+ for key in schema_keys:
+ add(key)
+
+ if plugin_name.startswith('search_backend_'):
+ backend_name = plugin_name.removeprefix('search_backend_').upper().replace('-', '_')
+ configured_engine = str(env.get('SEARCH_BACKEND_ENGINE') or '').strip().upper().replace('-', '_')
+ if backend_name and backend_name == configured_engine:
+ add(f'{backend_name}_BINARY')
+
+ hook_suffix = Path(hook_path).suffix.lower()
+ if hook_suffix == '.js':
+ if plugin_key:
+ add(f'{plugin_key}_NODE_BINARY')
+ add('NODE_BINARY')
+
+ return keys
+
+
+def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]:
+ if not isinstance(config, dict):
+ return {}
+
+ sanitized = dict(config)
+ for key in LEGACY_MACHINE_CONFIG_KEYS:
+ sanitized.pop(key, None)
+ return sanitized
class MachineManager(models.Manager):
@@ -89,13 +172,13 @@ class Machine(ModelWithHealthStats):
global _CURRENT_MACHINE
if _CURRENT_MACHINE:
if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
- return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
+ return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
_CURRENT_MACHINE = None
_CURRENT_MACHINE, _ = cls.objects.update_or_create(
guid=get_host_guid(),
defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()},
)
- return cls._hydrate_config_from_sibling(_CURRENT_MACHINE)
+ return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
@classmethod
def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine':
@@ -115,6 +198,15 @@ class Machine(ModelWithHealthStats):
machine.save(update_fields=['config', 'modified_at'])
return machine
+ @classmethod
+ def _sanitize_config(cls, machine: 'Machine') -> 'Machine':
+ sanitized = _sanitize_machine_config(machine.config)
+ current = machine.config or {}
+ if sanitized != current:
+ machine.config = sanitized
+ machine.save(update_fields=['config', 'modified_at'])
+ return machine
+
def to_json(self) -> dict:
"""
Convert Machine model instance to a JSON-serializable dict.
@@ -152,11 +244,10 @@ class Machine(ModelWithHealthStats):
Returns:
Machine instance or None
"""
- config_patch = record.get('config')
- if isinstance(config_patch, dict) and config_patch:
+ config_patch = _sanitize_machine_config(record.get('config'))
+ if config_patch:
machine = Machine.current()
- if not machine.config:
- machine.config = {}
+ machine.config = _sanitize_machine_config(machine.config)
machine.config.update(config_patch)
machine.save(update_fields=['config'])
return machine
@@ -194,13 +285,17 @@ class NetworkInterface(ModelWithHealthStats):
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
@classmethod
- def current(cls) -> 'NetworkInterface':
+ def current(cls, refresh: bool = False) -> 'NetworkInterface':
global _CURRENT_INTERFACE
+ machine = Machine.current()
if _CURRENT_INTERFACE:
- if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
+ if (
+ not refresh
+ and _CURRENT_INTERFACE.machine_id == machine.id
+ and timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
+ ):
return _CURRENT_INTERFACE
_CURRENT_INTERFACE = None
- machine = Machine.current()
net_info = get_host_network()
_CURRENT_INTERFACE, _ = cls.objects.update_or_create(
machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'),
@@ -747,14 +842,17 @@ class ProcessManager(models.Manager):
Called during migration and when creating new ArchiveResults.
"""
+ iface = kwargs.get('iface') or NetworkInterface.current()
+
# Defaults from ArchiveResult if not provided
defaults = {
- 'machine': Machine.current(),
+ 'machine': iface.machine,
'pwd': kwargs.get('pwd') or str(archiveresult.snapshot.output_dir / archiveresult.plugin),
'cmd': kwargs.get('cmd') or [],
'status': 'queued',
'timeout': kwargs.get('timeout', 120),
'env': kwargs.get('env', {}),
+ 'iface': iface,
}
defaults.update(kwargs)
@@ -971,6 +1069,28 @@ class Process(models.Model):
record['timeout'] = self.timeout
return record
+ def hydrate_binary_from_context(self, *, plugin_name: str = '', hook_path: str = '') -> 'Binary | None':
+ machine = self.machine if self.machine_id else Machine.current()
+
+ references: list[str] = []
+ for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env):
+ value = str(self.env.get(key) or '').strip()
+ if value and value not in references:
+ references.append(value)
+
+ if self.cmd:
+ cmd_0 = str(self.cmd[0]).strip()
+ if cmd_0 and cmd_0 not in references:
+ references.append(cmd_0)
+
+ for reference in references:
+ binary = _find_existing_binary_for_reference(machine, reference)
+ if binary:
+ self.binary = binary
+ return binary
+
+ return None
+
@classmethod
def parse_records_from_text(cls, text: str) -> list[dict]:
"""Parse JSONL records from raw text using the shared JSONL parser."""
@@ -1044,6 +1164,7 @@ class Process(models.Model):
current_pid = os.getpid()
machine = Machine.current()
+ iface = NetworkInterface.current()
# Check cache validity
if _CURRENT_PROCESS:
@@ -1053,6 +1174,9 @@ class Process(models.Model):
and _CURRENT_PROCESS.machine_id == machine.id
and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
):
+ if _CURRENT_PROCESS.iface_id != iface.id:
+ _CURRENT_PROCESS.iface = iface
+ _CURRENT_PROCESS.save(update_fields=['iface', 'modified_at'])
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
_CURRENT_PROCESS = None
@@ -1080,6 +1204,9 @@ class Process(models.Model):
db_start_time = existing.started_at.timestamp()
if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
_CURRENT_PROCESS = existing
+ if existing.iface_id != iface.id:
+ existing.iface = iface
+ existing.save(update_fields=['iface', 'modified_at'])
_CURRENT_PROCESS.ensure_log_files()
return existing
@@ -1112,6 +1239,7 @@ class Process(models.Model):
pid=current_pid,
started_at=started_at,
status=cls.StatusChoices.RUNNING,
+ iface=iface,
)
_CURRENT_PROCESS.ensure_log_files()
return _CURRENT_PROCESS
@@ -1176,7 +1304,9 @@ class Process(models.Model):
if 'supervisord' in argv_str:
return cls.TypeChoices.SUPERVISORD
- elif 'archivebox run' in argv_str or 'runner_watch' in argv_str:
+ elif 'runner_watch' in argv_str:
+ return cls.TypeChoices.WORKER
+ elif 'archivebox run' in argv_str:
return cls.TypeChoices.ORCHESTRATOR
elif 'archivebox' in argv_str:
return cls.TypeChoices.CLI
@@ -1321,14 +1451,17 @@ class Process(models.Model):
if self.cmd:
try:
os_cmdline = os_proc.cmdline()
- # Check if first arg (binary) matches
if os_cmdline and self.cmd:
- os_binary = os_cmdline[0] if os_cmdline else ''
db_binary = self.cmd[0] if self.cmd else ''
- # Match by basename (handles /usr/bin/python3 vs python3)
- if os_binary and db_binary:
- if Path(os_binary).name != Path(db_binary).name:
- return None # Different binary, PID reused
+ if db_binary:
+ db_binary_name = Path(db_binary).name
+ cmd_matches = any(
+ arg == db_binary or Path(arg).name == db_binary_name
+ for arg in os_cmdline
+ if arg
+ )
+ if not cmd_matches:
+ return None # Different command, PID reused
except (psutil.AccessDenied, psutil.ZombieProcess):
pass # Can't check cmdline, trust start time match
diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py
index e040b219..5d9a3129 100644
--- a/archivebox/misc/util.py
+++ b/archivebox/misc/util.py
@@ -4,6 +4,7 @@ import re
import requests
import json as pyjson
import http.cookiejar
+from dateparser import parse as dateparser
from typing import List, Optional, Any, Callable
from pathlib import Path
@@ -13,7 +14,6 @@ from hashlib import sha256
from urllib.parse import urlparse, quote, unquote
from html import escape, unescape
from datetime import datetime, timezone
-from dateparser import parse as dateparser
from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode
@@ -122,9 +122,35 @@ def fix_url_from_markdown(url_str: str) -> str:
return url_str
+def split_comma_separated_urls(url: str):
+ offset = 0
+ while True:
+ http_index = url.find('http://', 1)
+ https_index = url.find('https://', 1)
+ next_indices = [idx for idx in (http_index, https_index) if idx != -1]
+ if not next_indices:
+ yield offset, url
+ return
+
+ next_index = min(next_indices)
+ if url[next_index - 1] != ',':
+ yield offset, url
+ return
+
+ yield offset, url[:next_index - 1]
+ offset += next_index
+ url = url[next_index:]
+
def find_all_urls(urls_str: str):
- for url in re.findall(URL_REGEX, urls_str):
- yield fix_url_from_markdown(url)
+ skipped_starts = set()
+ for match in re.finditer(URL_REGEX, urls_str):
+ if match.start() in skipped_starts:
+ continue
+
+ for offset, url in split_comma_separated_urls(fix_url_from_markdown(match.group(1))):
+ if offset:
+ skipped_starts.add(match.start() + offset)
+ yield url
def is_static_file(url: str):
@@ -214,7 +240,25 @@ def parse_date(date: Any) -> datetime | None:
date = str(date)
if isinstance(date, str):
- parsed_date = dateparser(date, settings={'TIMEZONE': 'UTC'})
+ normalized = date.strip()
+ if not normalized:
+ raise ValueError(f'Tried to parse invalid date string! {date}')
+
+ try:
+ return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
+ except (TypeError, ValueError, OSError):
+ pass
+
+ try:
+ iso_date = normalized.replace('Z', '+00:00')
+ parsed_date = datetime.fromisoformat(iso_date)
+ if parsed_date.tzinfo is None:
+ return parsed_date.replace(tzinfo=timezone.utc)
+ return parsed_date.astimezone(timezone.utc)
+ except ValueError:
+ pass
+
+ parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
if parsed_date is None:
raise ValueError(f'Tried to parse invalid date string! {date}')
return parsed_date.astimezone(timezone.utc)
@@ -408,6 +452,7 @@ assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguat
URL_REGEX_TESTS = [
('https://example.com', ['https://example.com']),
+ ('https://sweeting.me,https://google.com', ['https://sweeting.me', 'https://google.com']),
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py
index b97a94f6..501495bf 100644
--- a/archivebox/personas/admin.py
+++ b/archivebox/personas/admin.py
@@ -1,2 +1,169 @@
+__package__ = "archivebox.personas"
-# Register your models here.
+import shutil
+
+from django.contrib import admin, messages
+from django.utils.html import format_html, format_html_join
+
+from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
+from archivebox.personas.forms import PersonaAdminForm
+from archivebox.personas.importers import discover_local_browser_profiles
+from archivebox.personas.models import Persona
+
+
+class PersonaAdmin(ConfigEditorMixin, BaseModelAdmin):
+ form = PersonaAdminForm
+ change_form_template = "admin/personas/persona/change_form.html"
+
+ list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state")
+ search_fields = ("name", "created_by__username")
+ list_filter = ("created_by",)
+ ordering = ["name"]
+ list_per_page = 100
+ readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status")
+
+ add_fieldsets = (
+ ("Persona", {
+ "fields": ("name", "created_by"),
+ "classes": ("card",),
+ }),
+ ("Browser Import", {
+ "fields": (
+ "import_mode",
+ "import_discovered_profile",
+ "import_source",
+ "import_profile_name",
+ "import_copy_profile",
+ "import_extract_cookies",
+ "import_capture_storage",
+ ),
+ "classes": ("card", "wide"),
+ }),
+ ("Advanced", {
+ "fields": ("config",),
+ "classes": ("card", "wide"),
+ }),
+ )
+
+ change_fieldsets = add_fieldsets + (
+ ("Artifacts", {
+ "fields": ("persona_paths", "import_artifact_status"),
+ "classes": ("card", "wide"),
+ }),
+ ("Timestamps", {
+ "fields": ("id", "created_at"),
+ "classes": ("card",),
+ }),
+ )
+
+ @admin.display(description="Chrome Profile")
+ def chrome_profile_state(self, obj: Persona) -> str:
+ return "yes" if (obj.path / "chrome_user_data").exists() else "no"
+
+ @admin.display(description="cookies.txt")
+ def cookies_state(self, obj: Persona) -> str:
+ return "yes" if obj.COOKIES_FILE else "no"
+
+ @admin.display(description="auth.json")
+ def auth_state(self, obj: Persona) -> str:
+ return "yes" if obj.AUTH_STORAGE_FILE else "no"
+
+ @admin.display(description="Persona Paths")
+ def persona_paths(self, obj: Persona) -> str:
+ return format_html(
+ ""
+ "
Persona root {}
"
+ "
chrome_user_data {}
"
+ "
chrome_extensions {}
"
+ "
chrome_downloads {}
"
+ "
cookies.txt {}
"
+ "
auth.json {}
"
+ "
",
+ obj.path,
+ obj.CHROME_USER_DATA_DIR,
+ obj.CHROME_EXTENSIONS_DIR,
+ obj.CHROME_DOWNLOADS_DIR,
+ obj.COOKIES_FILE or (obj.path / "cookies.txt"),
+ obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"),
+ )
+
+ @admin.display(description="Import Artifacts")
+ def import_artifact_status(self, obj: Persona) -> str:
+ entries = [
+ ("Browser profile", (obj.path / "chrome_user_data").exists(), obj.CHROME_USER_DATA_DIR),
+ ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")),
+ ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")),
+ ]
+ return format_html(
+ "{}
",
+ format_html_join(
+ "",
+ "{} {} {}
",
+ (
+ (
+ label,
+ "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no",
+ "present" if enabled else "missing",
+ path,
+ )
+ for label, enabled, path in entries
+ ),
+ ),
+ )
+
+ def get_fieldsets(self, request, obj=None):
+ return self.change_fieldsets if obj else self.add_fieldsets
+
+ def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
+ context["detected_profile_count"] = len(discover_local_browser_profiles())
+ return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
+
+ def save_model(self, request, obj, form, change):
+ old_path = None
+ new_path = None
+ if change:
+ previous = Persona.objects.get(pk=obj.pk)
+ if previous.name != obj.name:
+ old_path = previous.path
+ new_path = obj.path
+
+ super().save_model(request, obj, form, change)
+
+ if old_path and new_path and old_path != new_path and old_path.exists():
+ if new_path.exists():
+ raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}")
+ shutil.move(str(old_path), str(new_path))
+
+ obj.ensure_dirs()
+
+ import_result = form.apply_import(obj)
+ if import_result is None:
+ return
+
+ completed_actions = []
+ if import_result.profile_copied:
+ completed_actions.append("profile copied")
+ if import_result.cookies_imported:
+ completed_actions.append("cookies.txt generated")
+ if import_result.storage_captured:
+ completed_actions.append("auth.json captured")
+ if import_result.user_agent_imported:
+ completed_actions.append("USER_AGENT copied")
+
+ if completed_actions:
+ messages.success(
+ request,
+ f'Imported {", ".join(completed_actions)} from {import_result.source.display_label}.',
+ )
+ else:
+ messages.warning(
+ request,
+ f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.",
+ )
+
+ for warning in import_result.warnings:
+ messages.warning(request, warning)
+
+
+def register_admin(admin_site: admin.AdminSite) -> None:
+ admin_site.register(Persona, PersonaAdmin)
diff --git a/archivebox/personas/export_browser_state.js b/archivebox/personas/export_browser_state.js
new file mode 100644
index 00000000..77b394f9
--- /dev/null
+++ b/archivebox/personas/export_browser_state.js
@@ -0,0 +1,210 @@
+#!/usr/bin/env node
+/**
+ * Export cookies and open-tab storage from a Chromium profile or live CDP URL.
+ *
+ * Environment variables:
+ * ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins
+ * CHROME_USER_DATA_DIR Local Chromium user-data directory to launch
+ * CHROME_CDP_URL Existing browser CDP URL to attach to
+ * COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt
+ * AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json
+ * CHROME_BINARY Optional browser binary override
+ * NODE_MODULES_DIR Optional node_modules path for puppeteer-core
+ */
+
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+
+const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR;
+if (!pluginsDir) {
+ console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required');
+ process.exit(1);
+}
+
+const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js'));
+baseUtils.ensureNodeModuleResolution(module);
+
+const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js'));
+const puppeteer = require('puppeteer-core');
+
+function cookieToNetscape(cookie) {
+ let domain = cookie.domain;
+ if (!domain.startsWith('.') && !cookie.hostOnly) {
+ domain = '.' + domain;
+ }
+
+ const includeSubdomains = domain.startsWith('.') ? 'TRUE' : 'FALSE';
+ const cookiePath = cookie.path || '/';
+ const secure = cookie.secure ? 'TRUE' : 'FALSE';
+ const expiry = cookie.expires && cookie.expires > 0 ? Math.floor(cookie.expires).toString() : '0';
+
+ return `${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${cookie.name}\t${cookie.value}`;
+}
+
+function writeCookiesFile(cookies, outputPath) {
+ const lines = [
+ '# Netscape HTTP Cookie File',
+ '# https://curl.se/docs/http-cookies.html',
+ '# This file was generated by ArchiveBox persona cookie extraction',
+ '#',
+ '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
+ '',
+ ];
+
+ for (const cookie of cookies) {
+ lines.push(cookieToNetscape(cookie));
+ }
+
+ fs.mkdirSync(path.dirname(outputPath), { recursive: true });
+ fs.writeFileSync(outputPath, lines.join('\n') + '\n');
+}
+
+async function collectStorage(browser) {
+ const localStorage = {};
+ const sessionStorage = {};
+ const pages = await browser.pages();
+
+ for (const page of pages) {
+ try {
+ const url = page.url();
+ if (!url || url === 'about:blank') continue;
+ if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue;
+
+ const payload = await page.evaluate(() => ({
+ origin: window.location.origin,
+ localStorage: Object.fromEntries(Object.entries(window.localStorage)),
+ sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)),
+ }));
+
+ if (!payload.origin || payload.origin === 'null') continue;
+ if (Object.keys(payload.localStorage || {}).length > 0) {
+ localStorage[payload.origin] = payload.localStorage;
+ }
+ if (Object.keys(payload.sessionStorage || {}).length > 0) {
+ sessionStorage[payload.origin] = payload.sessionStorage;
+ }
+ } catch (error) {
+ // Ignore pages that cannot be inspected via evaluate().
+ }
+ }
+
+ return { localStorage, sessionStorage };
+}
+
+async function openBrowser() {
+ const cdpUrl = process.env.CHROME_CDP_URL || '';
+ if (cdpUrl) {
+ const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null });
+ return {
+ browser,
+ async cleanup() {
+ try {
+ await browser.disconnect();
+ } catch (error) {}
+ },
+ sourceDescription: cdpUrl,
+ };
+ }
+
+ const userDataDir = process.env.CHROME_USER_DATA_DIR;
+ if (!userDataDir) {
+ throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required');
+ }
+ if (!fs.existsSync(userDataDir)) {
+ throw new Error(`User data directory does not exist: ${userDataDir}`);
+ }
+
+ const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-'));
+ const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary();
+ if (!binary) {
+ throw new Error('Could not find a Chromium binary for browser state export');
+ }
+
+ const launched = await chromeUtils.launchChromium({
+ binary,
+ outputDir,
+ userDataDir,
+ headless: true,
+ killZombies: false,
+ });
+
+ if (!launched.success) {
+ throw new Error(launched.error || 'Chrome launch failed');
+ }
+
+ const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null });
+
+ return {
+ browser,
+ async cleanup() {
+ try {
+ await browser.disconnect();
+ } catch (error) {}
+ try {
+ await chromeUtils.killChrome(launched.pid, outputDir);
+ } catch (error) {}
+ try {
+ fs.rmSync(outputDir, { recursive: true, force: true });
+ } catch (error) {}
+ },
+ sourceDescription: userDataDir,
+ };
+}
+
+async function main() {
+ const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || '';
+ const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || '';
+ if (!cookiesOutput && !authOutput) {
+ throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required');
+ }
+
+ const { browser, cleanup, sourceDescription } = await openBrowser();
+
+ try {
+ const session = await browser.target().createCDPSession();
+ const browserVersion = await session.send('Browser.getVersion');
+ const cookieResult = await session.send('Storage.getCookies');
+ const cookies = cookieResult?.cookies || [];
+ const { localStorage, sessionStorage } = await collectStorage(browser);
+ const userAgent = browserVersion?.userAgent || '';
+
+ if (cookiesOutput) {
+ writeCookiesFile(cookies, cookiesOutput);
+ }
+
+ if (authOutput) {
+ fs.mkdirSync(path.dirname(authOutput), { recursive: true });
+ fs.writeFileSync(
+ authOutput,
+ JSON.stringify(
+ {
+ TYPE: 'auth',
+ SOURCE: sourceDescription,
+ captured_at: new Date().toISOString(),
+ user_agent: userAgent,
+ cookies,
+ localStorage,
+ sessionStorage,
+ },
+ null,
+ 2,
+ ) + '\n',
+ );
+ }
+
+ console.error(
+ `[+] Exported ${cookies.length} cookies` +
+ `${authOutput ? ` and ${Object.keys(localStorage).length + Object.keys(sessionStorage).length} storage origins` : ''}` +
+ `${userAgent ? ' with browser USER_AGENT' : ''}` +
+ ` from ${sourceDescription}`,
+ );
+ } finally {
+ await cleanup();
+ }
+}
+
+main().catch((error) => {
+ console.error(`ERROR: ${error.message}`);
+ process.exit(1);
+});
diff --git a/archivebox/personas/forms.py b/archivebox/personas/forms.py
new file mode 100644
index 00000000..fbcf8a61
--- /dev/null
+++ b/archivebox/personas/forms.py
@@ -0,0 +1,176 @@
+__package__ = "archivebox.personas"
+
+from typing import Any
+
+from django import forms
+from django.utils.safestring import mark_safe
+
+from archivebox.personas.importers import (
+ PersonaImportResult,
+ PersonaImportSource,
+ discover_local_browser_profiles,
+ import_persona_from_source,
+ resolve_custom_import_source,
+ validate_persona_name,
+)
+from archivebox.personas.models import Persona
+
+
+def _mode_label(title: str, description: str) -> str:
+ return mark_safe(
+ f'{title} {description} '
+ )
+
+
+class PersonaAdminForm(forms.ModelForm):
+ import_mode = forms.ChoiceField(
+ required=False,
+ initial="none",
+ label="Bootstrap this persona",
+ widget=forms.RadioSelect,
+ choices=(
+ ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")),
+ ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")),
+ ("custom", _mode_label("Use a custom path or CDP URL", "Paste an absolute Chromium path or attach to a live browser debugging endpoint.")),
+ ),
+ help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.",
+ )
+ import_discovered_profile = forms.ChoiceField(
+ required=False,
+ label="Autodiscovered profiles",
+ widget=forms.RadioSelect,
+ choices=(),
+ help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.",
+ )
+ import_source = forms.CharField(
+ required=False,
+ label="Absolute path or CDP URL",
+ widget=forms.TextInput(
+ attrs={
+ "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or ws://127.0.0.1:9222/devtools/browser/...",
+ "style": "width: 100%; font-family: monospace;",
+ }
+ ),
+ help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.",
+ )
+ import_profile_name = forms.CharField(
+ required=False,
+ label="Profile directory name",
+ widget=forms.TextInput(
+ attrs={
+ "placeholder": "Default or Profile 1",
+ "style": "width: 100%; font-family: monospace;",
+ }
+ ),
+ help_text="Only used when the custom path points at a browser root containing multiple profiles.",
+ )
+ import_copy_profile = forms.BooleanField(
+ required=False,
+ initial=True,
+ label="Copy browser profile into this persona",
+ help_text="Copies the chosen Chromium user-data tree into `chrome_user_data` for future archiving runs.",
+ )
+ import_extract_cookies = forms.BooleanField(
+ required=False,
+ initial=True,
+ label="Generate `cookies.txt`",
+ help_text="Extracts cookies through Chrome DevTools Protocol and writes a Netscape cookie jar for wget/curl-based plugins.",
+ )
+ import_capture_storage = forms.BooleanField(
+ required=False,
+ initial=True,
+ label="Capture open-tab storage into `auth.json`",
+ help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.",
+ )
+
+ class Meta:
+ model = Persona
+ fields = ("name", "created_by", "config")
+
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
+ super().__init__(*args, **kwargs)
+ self.discovered_profiles = discover_local_browser_profiles()
+ self._resolved_import_source: PersonaImportSource | None = None
+
+ self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode"
+ self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker"
+
+ if self.discovered_profiles:
+ self.fields["import_discovered_profile"].choices = [
+ (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles
+ ]
+ else:
+ self.fields["import_discovered_profile"].choices = []
+ self.fields["import_discovered_profile"].help_text = (
+ "No local Chromium profiles were detected on this host right now. "
+ "Use the custom path/CDP option if the browser data lives elsewhere."
+ )
+
+ def clean_name(self) -> str:
+ name = str(self.cleaned_data.get("name") or "").strip()
+ is_valid, error_message = validate_persona_name(name)
+ if not is_valid:
+ raise forms.ValidationError(error_message)
+ return name
+
+ def clean(self) -> dict[str, Any]:
+ cleaned_data = super().clean()
+ self._resolved_import_source = None
+
+ import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none"
+ if import_mode == "none":
+ return cleaned_data
+
+ if import_mode == "discovered":
+ selection = str(cleaned_data.get("import_discovered_profile") or "").strip()
+ if not selection:
+ self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.")
+ return cleaned_data
+ try:
+ self._resolved_import_source = PersonaImportSource.from_choice_value(selection)
+ except ValueError as err:
+ self.add_error("import_discovered_profile", str(err))
+ return cleaned_data
+ elif import_mode == "custom":
+ raw_value = str(cleaned_data.get("import_source") or "").strip()
+ if not raw_value:
+ self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.")
+ return cleaned_data
+ try:
+ self._resolved_import_source = resolve_custom_import_source(
+ raw_value,
+ profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None,
+ )
+ except ValueError as err:
+ self.add_error("import_source", str(err))
+ return cleaned_data
+ else:
+ self.add_error("import_mode", "Choose how this Persona should be bootstrapped.")
+ return cleaned_data
+
+ copy_profile = bool(cleaned_data.get("import_copy_profile"))
+ import_cookies = bool(cleaned_data.get("import_extract_cookies"))
+ capture_storage = bool(cleaned_data.get("import_capture_storage"))
+
+ if self._resolved_import_source.kind == "cdp":
+ if not (import_cookies or capture_storage):
+ self.add_error(
+ "import_extract_cookies",
+ "CDP imports can only capture cookies and/or open-tab storage. Profile copying is not available for a remote browser endpoint.",
+ )
+ elif not (copy_profile or import_cookies or capture_storage):
+ raise forms.ValidationError("Select at least one import action.")
+
+ return cleaned_data
+
+ def apply_import(self, persona: Persona) -> PersonaImportResult | None:
+ if not self._resolved_import_source:
+ return None
+
+ return import_persona_from_source(
+ persona,
+ self._resolved_import_source,
+ copy_profile=bool(self.cleaned_data.get("import_copy_profile")),
+ import_cookies=bool(self.cleaned_data.get("import_extract_cookies")),
+ capture_storage=bool(self.cleaned_data.get("import_capture_storage")),
+ )
diff --git a/archivebox/personas/importers.py b/archivebox/personas/importers.py
new file mode 100644
index 00000000..fa0963bd
--- /dev/null
+++ b/archivebox/personas/importers.py
@@ -0,0 +1,845 @@
+"""
+Shared persona browser discovery/import helpers.
+
+These helpers are used by both the CLI and the Django admin so Persona import
+behavior stays consistent regardless of where it is triggered from.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import platform
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+from urllib.parse import urlparse
+
+from django.utils.html import format_html
+from django.utils.safestring import SafeString
+
+if TYPE_CHECKING:
+ from archivebox.personas.models import Persona
+
+
+# Display names for each supported import-source browser key.
+BROWSER_LABELS = {
+    "chrome": "Google Chrome",
+    "chromium": "Chromium",
+    "brave": "Brave",
+    "edge": "Microsoft Edge",
+    "custom": "Custom Path",
+    "persona": "Persona Template",
+}
+
+# Directory-name prefixes inside a Chromium user-data dir that denote profiles.
+BROWSER_PROFILE_DIR_NAMES = (
+    "Default",
+    "Profile ",
+    "Guest Profile",
+)
+
+# Files/dirs excluded when copying a profile: caches, metrics, logs, and the
+# Singleton* lock files that would make the copy look like a running browser.
+VOLATILE_PROFILE_COPY_PATTERNS = (
+    "Cache",
+    "Code Cache",
+    "GPUCache",
+    "ShaderCache",
+    "Service Worker",
+    "GCM Store",
+    "*.log",
+    "Crashpad",
+    "BrowserMetrics",
+    "BrowserMetrics-spare.pma",
+    "SingletonLock",
+    "SingletonSocket",
+    "SingletonCookie",
+)
+
+# Subdirectories of a persona dir that may contain a Chromium user-data dir.
+PERSONA_PROFILE_DIR_CANDIDATES = (
+    "chrome_profile",
+    "chrome_user_data",
+)
+
+
+@dataclass(frozen=True)
+class PersonaImportSource:
+    """Immutable description of where a Persona import pulls browser state from.
+
+    ``kind`` is either ``"browser-profile"`` (a local Chromium user-data dir plus
+    a profile subdirectory) or ``"cdp"`` (a remote DevTools endpoint reachable
+    via ``cdp_url``). Frozen so instances can be safely shared/deduplicated.
+    """
+
+    kind: str
+    browser: str = "custom"               # key into BROWSER_LABELS
+    source_name: str | None = None        # e.g. the persona-template dir name
+    user_data_dir: Path | None = None     # Chromium --user-data-dir root
+    profile_dir: str | None = None        # subdir name, e.g. "Default"
+    browser_binary: str | None = None     # explicit binary path, if detected
+    cdp_url: str | None = None            # only set for kind == "cdp"
+
+    @property
+    def browser_label(self) -> str:
+        """Human-readable browser name (falls back to a title-cased key)."""
+        return BROWSER_LABELS.get(self.browser, self.browser.title())
+
+    @property
+    def profile_path(self) -> Path | None:
+        """Full path to the profile subdir, or None if either part is missing."""
+        if not self.user_data_dir or not self.profile_dir:
+            return None
+        return self.user_data_dir / self.profile_dir
+
+    @property
+    def display_label(self) -> str:
+        """Short one-line label used in CLI/admin listings."""
+        if self.kind == "cdp":
+            return self.cdp_url or "CDP URL"
+        profile_suffix = f" / {self.profile_dir}" if self.profile_dir else ""
+        source_prefix = f": {self.source_name}" if self.source_name else ""
+        return f"{self.browser_label}{source_prefix}{profile_suffix}"
+
+    @property
+    def choice_value(self) -> str:
+        """Stable JSON blob used as the form <option> value.
+
+        Parsed back by from_choice_value(); sort_keys makes equal sources
+        produce byte-identical option values.
+        """
+        return json.dumps(
+            {
+                "kind": self.kind,
+                "browser": self.browser,
+                "source_name": self.source_name or "",
+                "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "",
+                "profile_dir": self.profile_dir or "",
+                "browser_binary": self.browser_binary or "",
+                "cdp_url": self.cdp_url or "",
+            },
+            sort_keys=True,
+        )
+
+    def as_choice_label(self) -> SafeString:
+        """Render the HTML label shown next to this source's radio/option."""
+        # NOTE(review): the first format_html() literal is empty and the others
+        # end with a bare space — it looks like HTML tags may have been stripped
+        # from these string literals; confirm the intended markup.
+        path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "")
+        binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary"
+        return format_html(
+            ''
+            '{} '
+            '{} '
+            '{}'
+            " ",
+            self.display_label,
+            binary_suffix,
+            path_str,
+        )
+
+    @classmethod
+    def from_choice_value(cls, value: str) -> "PersonaImportSource":
+        """Parse a choice_value JSON blob back into a validated source.
+
+        Only "browser-profile" selections are representable here (CDP sources
+        are entered via the custom field instead). Re-resolving through
+        resolve_browser_profile_source() re-checks the paths still exist, so a
+        tampered or stale selection raises ValueError.
+        """
+        try:
+            payload = json.loads(value)
+        except json.JSONDecodeError as err:
+            raise ValueError("Invalid discovered profile selection.") from err
+
+        if payload.get("kind") != "browser-profile":
+            raise ValueError("Invalid discovered profile selection.")
+
+        user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser()
+        profile_dir = str(payload.get("profile_dir") or "").strip()
+        browser = str(payload.get("browser") or "custom").strip().lower() or "custom"
+        source_name = str(payload.get("source_name") or "").strip() or None
+        browser_binary = str(payload.get("browser_binary") or "").strip() or None
+
+        return resolve_browser_profile_source(
+            browser=browser,
+            source_name=source_name,
+            user_data_dir=user_data_dir,
+            profile_dir=profile_dir,
+            browser_binary=browser_binary,
+        )
+
+
+@dataclass
+class PersonaImportResult:
+    """Outcome flags (and non-fatal warnings) for one persona import run."""
+
+    source: PersonaImportSource
+    profile_copied: bool = False
+    cookies_imported: bool = False
+    storage_captured: bool = False
+    user_agent_imported: bool = False
+    warnings: list[str] = field(default_factory=list)
+
+    @property
+    def did_work(self) -> bool:
+        """True when at least one import action actually changed persona state."""
+        return self.profile_copied or self.cookies_imported or self.storage_captured or self.user_agent_imported
+
+
+def get_chrome_user_data_dir() -> Optional[Path]:
+    """Get the default Chrome user data directory for the current platform.
+
+    Returns the first well-known candidate that exists AND contains at least
+    one profile (per _list_profile_names), or None if nothing usable is found.
+    """
+    system = platform.system()
+    home = Path.home()
+
+    if system == "Darwin":
+        candidates = [
+            home / "Library" / "Application Support" / "Google" / "Chrome",
+            home / "Library" / "Application Support" / "Chromium",
+        ]
+    elif system == "Linux":
+        candidates = [
+            home / ".config" / "google-chrome",
+            home / ".config" / "chromium",
+            home / ".config" / "chrome",
+            home / "snap" / "chromium" / "common" / "chromium",  # snap-confined install
+        ]
+    elif system == "Windows":
+        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
+        candidates = [
+            local_app_data / "Google" / "Chrome" / "User Data",
+            local_app_data / "Chromium" / "User Data",
+        ]
+    else:
+        candidates = []
+
+    for candidate in candidates:
+        # An existing dir with no profiles is not useful for import.
+        if candidate.exists() and _list_profile_names(candidate):
+            return candidate
+
+    return None
+
+
+def get_brave_user_data_dir() -> Optional[Path]:
+    """Get the default Brave user data directory for the current platform.
+
+    Same contract as get_chrome_user_data_dir(): the candidate must exist and
+    contain at least one profile, otherwise None is returned.
+    """
+    system = platform.system()
+    home = Path.home()
+
+    if system == "Darwin":
+        candidates = [
+            home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser",
+        ]
+    elif system == "Linux":
+        candidates = [
+            home / ".config" / "BraveSoftware" / "Brave-Browser",
+        ]
+    elif system == "Windows":
+        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
+        candidates = [
+            local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data",
+        ]
+    else:
+        candidates = []
+
+    for candidate in candidates:
+        if candidate.exists() and _list_profile_names(candidate):
+            return candidate
+
+    return None
+
+
+def get_edge_user_data_dir() -> Optional[Path]:
+    """Get the default Edge user data directory for the current platform.
+
+    Same contract as get_chrome_user_data_dir(): the candidate must exist and
+    contain at least one profile, otherwise None is returned.
+    """
+    system = platform.system()
+    home = Path.home()
+
+    if system == "Darwin":
+        candidates = [
+            home / "Library" / "Application Support" / "Microsoft Edge",
+        ]
+    elif system == "Linux":
+        candidates = [
+            home / ".config" / "microsoft-edge",
+            home / ".config" / "microsoft-edge-beta",
+            home / ".config" / "microsoft-edge-dev",
+        ]
+    elif system == "Windows":
+        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
+        candidates = [
+            local_app_data / "Microsoft" / "Edge" / "User Data",
+        ]
+    else:
+        candidates = []
+
+    for candidate in candidates:
+        if candidate.exists() and _list_profile_names(candidate):
+            return candidate
+
+    return None
+
+
+def get_browser_binary(browser: str) -> Optional[str]:
+    """Return the first existing executable path for *browser* on this platform.
+
+    *browser* is one of the keys in BROWSER_PROFILE_FINDERS (case-insensitive).
+    Returns None when no well-known install location exists.
+    """
+    system = platform.system()
+    home = Path.home()
+    browser = browser.lower()
+
+    if system == "Darwin":
+        candidates = {
+            "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
+            "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"],
+            "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"],
+            "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"],
+        }.get(browser, [])
+    elif system == "Linux":
+        candidates = {
+            "chrome": ["/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/google-chrome-beta", "/usr/bin/google-chrome-unstable"],
+            "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"],
+            "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"],
+            "edge": ["/usr/bin/microsoft-edge", "/usr/bin/microsoft-edge-stable", "/usr/bin/microsoft-edge-beta", "/usr/bin/microsoft-edge-dev"],
+        }.get(browser, [])
+    elif system == "Windows":
+        local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local"))
+        candidates = {
+            "chrome": [
+                str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"),
+                "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+                "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
+            ],
+            "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")],
+            "brave": [
+                str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"),
+                "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
+                "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
+            ],
+            "edge": [
+                str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"),
+                "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
+                "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
+            ],
+        }.get(browser, [])
+    else:
+        candidates = []
+
+    for candidate in candidates:
+        if candidate and Path(candidate).exists():
+            return candidate
+
+    return None
+
+
+# Maps supported browser key -> function that locates its default user-data dir.
+# chrome and chromium intentionally share a finder (same candidate paths).
+BROWSER_PROFILE_FINDERS = {
+    "chrome": get_chrome_user_data_dir,
+    "chromium": get_chrome_user_data_dir,
+    "brave": get_brave_user_data_dir,
+    "edge": get_edge_user_data_dir,
+}
+
+# All supported local-browser keys, in finder order.
+CHROMIUM_BROWSERS = tuple(BROWSER_PROFILE_FINDERS.keys())
+
+
+# Header lines written at the top of every generated Netscape cookies.txt file.
+NETSCAPE_COOKIE_HEADER = [
+    "# Netscape HTTP Cookie File",
+    "# https://curl.se/docs/http-cookies.html",
+    "# This file was generated by ArchiveBox persona cookie extraction",
+    "#",
+    "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue",
+    "",
+]
+
+
+def validate_persona_name(name: str) -> tuple[bool, str]:
+    """Validate persona name to prevent path traversal.
+
+    Returns (True, "") when valid, or (False, reason) describing the first
+    rule the name violates. Rejects separators, "..", leading dots, and
+    NUL/newline characters since the name becomes a directory name.
+    """
+    if not name or not name.strip():
+        return False, "Persona name cannot be empty"
+    if "/" in name or "\\" in name:
+        return False, "Persona name cannot contain path separators (/ or \\)"
+    if ".." in name:
+        return False, "Persona name cannot contain parent directory references (..)"
+    if name.startswith("."):
+        return False, "Persona name cannot start with a dot (.)"
+    if "\x00" in name or "\n" in name or "\r" in name:
+        return False, "Persona name contains invalid characters"
+    return True, ""
+
+
+def discover_local_browser_profiles() -> list[PersonaImportSource]:
+    """Enumerate every importable local source: all profiles of every installed
+    supported browser, followed by existing persona-template profiles."""
+    discovered: list[PersonaImportSource] = []
+
+    for browser, finder in BROWSER_PROFILE_FINDERS.items():
+        user_data_dir = finder()
+        if not user_data_dir:
+            continue  # browser not installed / no usable profiles
+
+        browser_binary = get_browser_binary(browser)
+        for profile_dir in _list_profile_names(user_data_dir):
+            try:
+                discovered.append(
+                    resolve_browser_profile_source(
+                        browser=browser,
+                        user_data_dir=user_data_dir,
+                        profile_dir=profile_dir,
+                        browser_binary=browser_binary,
+                    )
+                )
+            except ValueError:
+                # Skip profiles that fail re-validation (e.g. removed on disk).
+                continue
+
+    discovered.extend(discover_persona_template_profiles())
+
+    return discovered
+
+
+def discover_persona_template_profiles(personas_dir: Path | None = None) -> list[PersonaImportSource]:
+    """Find importable Chromium profiles inside existing persona template dirs.
+
+    Scans *personas_dir* when given, otherwise both the configured
+    CONSTANTS.PERSONAS_DIR and ~/.config/abx/personas, de-duplicating roots
+    that resolve to the same filesystem path.
+    """
+    from archivebox.config.constants import CONSTANTS
+
+    templates: list[PersonaImportSource] = []
+    candidate_roots: list[Path] = []
+
+    if personas_dir is not None:
+        candidate_roots.append(personas_dir.expanduser())
+    else:
+        candidate_roots.extend(
+            [
+                CONSTANTS.PERSONAS_DIR.expanduser(),
+                Path.home() / ".config" / "abx" / "personas",
+            ]
+        )
+
+    seen_roots: set[Path] = set()
+    for personas_root in candidate_roots:
+        resolved_root = personas_root.resolve()
+        if resolved_root in seen_roots:
+            continue  # same dir reachable via two candidate paths
+        seen_roots.add(resolved_root)
+
+        if not resolved_root.exists() or not resolved_root.is_dir():
+            continue
+
+        # Deterministic, case-insensitive ordering of persona dirs.
+        for persona_dir in sorted((path for path in resolved_root.iterdir() if path.is_dir()), key=lambda path: path.name.lower()):
+            for candidate_dir_name in PERSONA_PROFILE_DIR_CANDIDATES:
+                user_data_dir = persona_dir / candidate_dir_name
+                if not user_data_dir.exists() or not user_data_dir.is_dir():
+                    continue
+
+                for profile_dir in _list_profile_names(user_data_dir):
+                    try:
+                        templates.append(
+                            resolve_browser_profile_source(
+                                browser="persona",
+                                source_name=persona_dir.name,
+                                user_data_dir=user_data_dir,
+                                profile_dir=profile_dir,
+                                browser_binary=get_browser_binary("chrome"),
+                            )
+                        )
+                    except ValueError:
+                        # Skip profiles that fail validation (e.g. vanished on disk).
+                        continue
+
+    return templates
+
+
+def resolve_browser_import_source(browser: str, profile_dir: str | None = None) -> PersonaImportSource:
+    """Resolve a named browser (and optional profile) into a validated source.
+
+    Raises ValueError when the browser key is unknown, its user-data dir can't
+    be found, or no profile subdirectory can be determined.
+    """
+    browser = browser.lower().strip()
+    if browser not in BROWSER_PROFILE_FINDERS:
+        supported = ", ".join(BROWSER_PROFILE_FINDERS)
+        raise ValueError(f"Unknown browser: {browser}. Supported browsers: {supported}")
+
+    user_data_dir = BROWSER_PROFILE_FINDERS[browser]()
+    if not user_data_dir:
+        raise ValueError(f"Could not find {browser} profile directory")
+
+    # Prefer the explicitly requested profile, otherwise fall back to "Default".
+    chosen_profile = profile_dir or pick_default_profile_dir(user_data_dir)
+    if not chosen_profile:
+        raise ValueError(f"Could not find a profile in {user_data_dir}")
+
+    return resolve_browser_profile_source(
+        browser=browser,
+        user_data_dir=user_data_dir,
+        profile_dir=chosen_profile,
+        browser_binary=get_browser_binary(browser),
+    )
+
+
+def resolve_browser_profile_source(
+    browser: str,
+    user_data_dir: Path,
+    profile_dir: str,
+    source_name: str | None = None,
+    browser_binary: str | None = None,
+) -> PersonaImportSource:
+    """Validate a (user_data_dir, profile_dir) pair and wrap it in a source.
+
+    Raises ValueError when the root doesn't exist, the profile name is empty,
+    or the profile subdir doesn't look like a Chromium profile.
+    """
+    resolved_root = user_data_dir.expanduser()
+    if not resolved_root.is_absolute():
+        resolved_root = resolved_root.resolve()
+    if not resolved_root.exists():
+        raise ValueError(f"Profile root does not exist: {resolved_root}")
+    if not profile_dir.strip():
+        raise ValueError("Profile directory name cannot be empty.")
+
+    profile_path = resolved_root / profile_dir
+    if not _looks_like_profile_dir(profile_path):
+        raise ValueError(f"Profile directory does not look valid: {profile_path}")
+
+    return PersonaImportSource(
+        kind="browser-profile",
+        browser=browser,
+        source_name=source_name,
+        user_data_dir=resolved_root,
+        profile_dir=profile_dir,
+        browser_binary=browser_binary,
+    )
+
+
+def resolve_custom_import_source(raw_value: str, profile_dir: str | None = None) -> PersonaImportSource:
+    """Resolve free-form user input into a source.
+
+    *raw_value* may be a CDP URL (ws/wss/http/https), an exact profile
+    directory path, or a user-data dir root (in which case *profile_dir* or
+    the default profile is used). Raises ValueError on anything unusable.
+    """
+    raw_value = raw_value.strip()
+    if not raw_value:
+        raise ValueError("Provide an absolute browser profile path or a CDP URL.")
+
+    if _looks_like_cdp_url(raw_value):
+        return PersonaImportSource(kind="cdp", cdp_url=raw_value)
+
+    source_path = Path(raw_value).expanduser()
+    if not source_path.is_absolute():
+        raise ValueError("Custom browser path must be an absolute path.")
+    if not source_path.exists():
+        raise ValueError(f"Custom browser path does not exist: {source_path}")
+
+    explicit_profile = profile_dir.strip() if profile_dir else ""
+    if _looks_like_profile_dir(source_path):
+        # The user pointed directly at a profile dir, not at its parent root.
+        if explicit_profile and explicit_profile != source_path.name:
+            raise ValueError("Profile name does not match the provided profile directory path.")
+        return resolve_browser_profile_source(
+            browser="custom",
+            user_data_dir=source_path.parent.resolve(),
+            profile_dir=source_path.name,
+        )
+
+    # Otherwise treat the path as a user-data dir root and pick a profile in it.
+    chosen_profile = explicit_profile or pick_default_profile_dir(source_path)
+    if not chosen_profile:
+        raise ValueError(
+            "Could not find a Chromium profile in that directory. "
+            "Provide an exact profile directory path or fill in the profile name field."
+        )
+
+    return resolve_browser_profile_source(
+        browser="custom",
+        user_data_dir=source_path.resolve(),
+        profile_dir=chosen_profile,
+    )
+
+
+def pick_default_profile_dir(user_data_dir: Path) -> str | None:
+    """Choose the profile to use when none was specified.
+
+    Prefers "Default" when present, otherwise the first discovered profile;
+    returns None when the dir contains no profiles at all.
+    """
+    profiles = _list_profile_names(user_data_dir)
+    if not profiles:
+        return None
+    if "Default" in profiles:
+        return "Default"
+    return profiles[0]
+
+
+def import_persona_from_source(
+    persona: "Persona",
+    source: PersonaImportSource,
+    *,
+    copy_profile: bool = True,
+    import_cookies: bool = True,
+    capture_storage: bool = False,
+) -> PersonaImportResult:
+    """Import browser state from *source* into *persona*.
+
+    Depending on the flags this copies the Chromium user-data dir, extracts
+    cookies to <persona>/cookies.txt, and/or captures local/session storage to
+    <persona>/auth.json. Non-fatal problems are collected in result.warnings
+    rather than raised.
+    """
+    persona.ensure_dirs()
+    result = PersonaImportResult(source=source)
+
+    persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
+    cookies_file = persona.path / "cookies.txt"
+    auth_file = persona.path / "auth.json"
+
+    # Which user-data dir the export script should launch against (None for CDP).
+    launch_user_data_dir: Path | None = None
+
+    if source.kind == "browser-profile":
+        if copy_profile and source.user_data_dir:
+            resolved_source_root = source.user_data_dir.resolve()
+            resolved_persona_root = persona_chrome_dir.resolve()
+            if resolved_source_root == resolved_persona_root:
+                # Copying the dir onto itself would destroy it (the copy wipes
+                # the destination first), so just warn and skip.
+                result.warnings.append("Skipped profile copy because the selected source is already this persona's chrome_user_data directory.")
+            else:
+                copy_browser_user_data_dir(resolved_source_root, resolved_persona_root)
+                persona.cleanup_chrome_profile(resolved_persona_root)
+                result.profile_copied = True
+            launch_user_data_dir = resolved_persona_root
+        else:
+            # Not copying: run the extraction directly against the source dir.
+            launch_user_data_dir = source.user_data_dir
+    elif copy_profile:
+        result.warnings.append("Profile copying is only available for local Chromium profile paths. CDP imports can only pull cookies and open-tab storage.")
+
+    if source.kind == "cdp":
+        export_success, auth_payload, export_message = export_browser_state(
+            cdp_url=source.cdp_url,
+            cookies_output_file=cookies_file if import_cookies else None,
+            auth_output_file=auth_file if capture_storage else None,
+        )
+    else:
+        export_success, auth_payload, export_message = export_browser_state(
+            user_data_dir=launch_user_data_dir,
+            profile_dir=source.profile_dir,
+            chrome_binary=source.browser_binary,
+            cookies_output_file=cookies_file if import_cookies else None,
+            auth_output_file=auth_file if capture_storage else None,
+        )
+
+    if not export_success:
+        result.warnings.append(export_message or "Browser import failed.")
+        return result
+
+    # Flags are confirmed against the filesystem, not just the request.
+    if import_cookies and cookies_file.exists():
+        result.cookies_imported = True
+    if capture_storage and auth_file.exists():
+        result.storage_captured = True
+    if _apply_imported_user_agent(persona, auth_payload):
+        result.user_agent_imported = True
+
+    return result
+
+
+def copy_browser_user_data_dir(source_dir: Path, destination_dir: Path) -> None:
+    """Copy a Chromium user-data dir, skipping caches/locks/metrics.
+
+    WARNING: the destination is deleted first (rmtree) — callers must ensure
+    source and destination are different directories. Symlinks are preserved
+    rather than followed.
+    """
+    destination_dir.parent.mkdir(parents=True, exist_ok=True)
+    shutil.rmtree(destination_dir, ignore_errors=True)
+    shutil.copytree(
+        source_dir,
+        destination_dir,
+        symlinks=True,
+        ignore=shutil.ignore_patterns(*VOLATILE_PROFILE_COPY_PATTERNS),
+    )
+
+
+def export_browser_state(
+    *,
+    user_data_dir: Path | None = None,
+    cdp_url: str | None = None,
+    profile_dir: str | None = None,
+    chrome_binary: str | None = None,
+    cookies_output_file: Path | None = None,
+    auth_output_file: Path | None = None,
+) -> tuple[bool, dict | None, str]:
+    """Run the Node export_browser_state.js helper against a local profile or CDP URL.
+
+    Returns (success, auth_payload, message). auth_payload is the parsed
+    auth-storage JSON (used downstream for user-agent import) when one was
+    produced; message is the script's stderr/stdout or an error description.
+    When an output file already exists, fresh output is written to a temp file
+    and merged into it instead of clobbering it.
+    """
+    if not user_data_dir and not cdp_url:
+        return False, None, "Missing browser source."
+
+    from abx_plugins import get_plugins_dir
+    from archivebox.config.common import STORAGE_CONFIG
+
+    state_script = Path(__file__).with_name("export_browser_state.js")
+    if not state_script.exists():
+        return False, None, f"Browser state export script not found at {state_script}"
+
+    node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules"
+    chrome_plugin_dir = Path(get_plugins_dir()).resolve()
+
+    # All configuration is passed to the Node script via environment variables.
+    env = os.environ.copy()
+    env["NODE_MODULES_DIR"] = str(node_modules_dir)
+    env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir)
+
+    if user_data_dir:
+        env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
+    if cdp_url:
+        env["CHROME_CDP_URL"] = cdp_url
+        env["CHROME_IS_LOCAL"] = "false"
+    if chrome_binary:
+        env["CHROME_BINARY"] = str(chrome_binary)
+    if profile_dir:
+        # Append --profile-directory to any pre-existing CHROME_ARGS_EXTRA,
+        # accepting either a JSON list or a comma-separated string.
+        extra_arg = f"--profile-directory={profile_dir}"
+        existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
+        args_list: list[str] = []
+        if existing_extra:
+            if existing_extra.startswith("["):
+                try:
+                    parsed = json.loads(existing_extra)
+                    if isinstance(parsed, list):
+                        parsed = json.loads(existing_extra)  # noqa: intentionally unchanged
+                except Exception:
+                    args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
+            else:
+                args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()])
+        args_list.append(extra_arg)
+        env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)
+
+    temp_dir: Path | None = None
+    tmp_cookies_file: Path | None = None
+    tmp_auth_file: Path | None = None
+
+    if cookies_output_file and cookies_output_file.exists():
+        # Existing cookies file: extract to a temp file so we can merge later.
+        temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
+        tmp_cookies_file = temp_dir / "cookies.txt"
+        env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file)
+    elif cookies_output_file:
+        env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file)
+
+    if auth_output_file and auth_output_file.exists():
+        temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
+        tmp_auth_file = temp_dir / "auth.json"
+        env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)
+    elif auth_output_file:
+        env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file)
+    else:
+        # Even when the caller didn't ask for auth storage, capture it to a
+        # temp file so the user_agent can still be returned in auth_payload.
+        temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
+        tmp_auth_file = temp_dir / "auth.json"
+        env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)
+
+    try:
+        result = subprocess.run(
+            ["node", str(state_script)],
+            env=env,
+            capture_output=True,
+            text=True,
+            timeout=120,  # hard cap: a hung browser must not block the import forever
+        )
+    except subprocess.TimeoutExpired:
+        return False, None, "Browser state export timed out."
+    except FileNotFoundError:
+        return False, None, "Node.js was not found, so ArchiveBox could not extract browser state."
+    except Exception as err:
+        return False, None, f"Browser state export failed: {err}"
+
+    if result.returncode != 0:
+        message = (result.stderr or result.stdout or "").strip() or "Browser state export failed."
+        return False, None, message
+
+    auth_payload: dict | None = None
+    if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists():
+        _merge_netscape_cookies(cookies_output_file, tmp_cookies_file)
+    if auth_output_file and tmp_auth_file and tmp_auth_file.exists():
+        _merge_auth_storage(auth_output_file, tmp_auth_file)
+        auth_payload = _load_auth_storage(tmp_auth_file)
+    elif auth_output_file and auth_output_file.exists():
+        auth_payload = _load_auth_storage(auth_output_file)
+    elif tmp_auth_file and tmp_auth_file.exists():
+        auth_payload = _load_auth_storage(tmp_auth_file)
+
+    # NOTE(review): temp_dir is only removed on this happy path — the early
+    # returns above (timeout, missing node, nonzero exit) leave it behind;
+    # consider try/finally cleanup.
+    if temp_dir and temp_dir.exists():
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+    return True, auth_payload, (result.stderr or result.stdout or "").strip()
+
+
+def _list_profile_names(user_data_dir: Path) -> list[str]:
+    """List profile subdirectory names inside a Chromium user-data dir.
+
+    Sorted case-insensitively; "System Profile" is always excluded.
+    """
+    if not user_data_dir.exists() or not user_data_dir.is_dir():
+        return []
+
+    profiles: list[str] = []
+    for child in sorted(user_data_dir.iterdir(), key=lambda path: path.name.lower()):
+        if not child.is_dir():
+            continue
+        if child.name == "System Profile":
+            continue
+        # NOTE(review): this well-known-name branch appears to produce the same
+        # result as the generic _looks_like_profile_dir() check below (which
+        # also matches these name prefixes) — likely redundant; confirm.
+        if child.name == "Default" or child.name.startswith("Profile ") or child.name.startswith("Guest Profile"):
+            if _looks_like_profile_dir(child):
+                profiles.append(child.name)
+            continue
+        if _looks_like_profile_dir(child):
+            profiles.append(child.name)
+    return profiles
+
+
+def _looks_like_profile_dir(path: Path) -> bool:
+    """Heuristic: does *path* look like a Chromium profile directory?
+
+    True when any well-known profile artifact exists inside it, or — as a
+    fallback — when the directory name matches a standard profile name prefix
+    (which means an empty "Default" dir still passes).
+    """
+    if not path.exists() or not path.is_dir():
+        return False
+
+    marker_paths = (
+        path / "Preferences",
+        path / "History",
+        path / "Cookies",
+        path / "Network" / "Cookies",  # newer Chromium keeps cookies under Network/
+        path / "Local Storage",
+        path / "Session Storage",
+    )
+
+    if any(marker.exists() for marker in marker_paths):
+        return True
+
+    return any(path.name == prefix or path.name.startswith(prefix) for prefix in BROWSER_PROFILE_DIR_NAMES)
+
+
+def _looks_like_cdp_url(value: str) -> bool:
+    """True when *value* parses as a ws(s)/http(s) URL with a host — treated as a CDP endpoint."""
+    parsed = urlparse(value)
+    return parsed.scheme in {"ws", "wss", "http", "https"} and bool(parsed.netloc)
+
+
+def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]:
+    """Parse a Netscape cookies.txt file into a dict keyed by (domain, path, name).
+
+    Later duplicate keys overwrite earlier ones. Comments, blank lines, and
+    malformed rows (< 7 tab-separated fields) are silently skipped; a missing
+    file yields an empty dict.
+    """
+    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {}
+    if not path.exists():
+        return cookies
+
+    for line in path.read_text().splitlines():
+        if not line or line.startswith("#"):
+            continue
+        parts = line.split("\t")
+        if len(parts) < 7:
+            continue
+        domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
+        cookies[(domain, cookie_path, name)] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
+    return cookies
+
+
+def _write_netscape_cookies(
+    path: Path,
+    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]],
+) -> None:
+    """Write cookies (as produced by _parse_netscape_cookies) to *path* with the standard header."""
+    lines = list(NETSCAPE_COOKIE_HEADER)
+    for cookie in cookies.values():
+        lines.append("\t".join(cookie))
+    path.write_text("\n".join(lines) + "\n")
+
+
+def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
+    """Merge *new_file* into *existing_file* in place; new cookies win on (domain, path, name) collisions."""
+    existing = _parse_netscape_cookies(existing_file)
+    new = _parse_netscape_cookies(new_file)
+    existing.update(new)
+    _write_netscape_cookies(existing_file, existing)
+
+
+def _merge_auth_storage(existing_file: Path, new_file: Path) -> None:
+    """Merge a freshly-captured auth.json into an existing one, in place.
+
+    localStorage/sessionStorage merge per-origin (new origins overwrite),
+    cookies merge per (domain, path, name), and the newest non-empty
+    user_agent wins. Top-level keys from the new payload override the old.
+    """
+    existing_payload = _load_auth_storage(existing_file)
+    new_payload = _load_auth_storage(new_file)
+
+    existing_local = existing_payload.setdefault("localStorage", {})
+    existing_session = existing_payload.setdefault("sessionStorage", {})
+
+    for origin, payload in (new_payload.get("localStorage") or {}).items():
+        existing_local[origin] = payload
+    for origin, payload in (new_payload.get("sessionStorage") or {}).items():
+        existing_session[origin] = payload
+
+    cookies = _merge_cookie_dicts(existing_payload.get("cookies") or [], new_payload.get("cookies") or [])
+
+    # new_payload is spread last so its top-level keys win, then the merged
+    # collections are applied explicitly on top of both.
+    merged = {
+        **existing_payload,
+        **new_payload,
+        "cookies": cookies,
+        "localStorage": existing_local,
+        "sessionStorage": existing_session,
+        "user_agent": new_payload.get("user_agent") or existing_payload.get("user_agent") or "",
+    }
+    existing_file.write_text(json.dumps(merged, indent=2, sort_keys=True) + "\n")
+
+
+def _load_auth_storage(path: Path) -> dict:
+    """Load an auth.json payload, returning an empty skeleton for any
+    missing, unparsable, or non-dict file (never raises)."""
+    if not path.exists():
+        return {
+            "TYPE": "auth",
+            "cookies": [],
+            "localStorage": {},
+            "sessionStorage": {},
+        }
+    try:
+        payload = json.loads(path.read_text())
+    except json.JSONDecodeError:
+        return {
+            "TYPE": "auth",
+            "cookies": [],
+            "localStorage": {},
+            "sessionStorage": {},
+        }
+    if not isinstance(payload, dict):
+        return {
+            "TYPE": "auth",
+            "cookies": [],
+            "localStorage": {},
+            "sessionStorage": {},
+        }
+    return payload
+
+
+def _merge_cookie_dicts(existing: list[dict], new: list[dict]) -> list[dict]:
+    """Union two cookie-dict lists keyed by (domain, path, name); *new* wins on collisions."""
+    merged: dict[tuple[str, str, str], dict] = {}
+    for cookie in existing:
+        key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
+        merged[key] = cookie
+    for cookie in new:
+        key = (str(cookie.get("domain") or ""), str(cookie.get("path") or "/"), str(cookie.get("name") or ""))
+        merged[key] = cookie
+    return list(merged.values())
+
+
+def _apply_imported_user_agent(persona: "Persona", auth_payload: dict | None) -> bool:
+    """Persist the captured browser user_agent into persona.config.
+
+    Returns True only when the config actually changed (and was saved).
+    """
+    if not auth_payload:
+        return False
+
+    user_agent = str(auth_payload.get("user_agent") or "").strip()
+    if not user_agent:
+        return False
+
+    config = dict(persona.config or {})
+    if config.get("USER_AGENT") == user_agent:
+        return False  # already up to date; avoid a no-op save
+
+    config["USER_AGENT"] = user_agent
+    persona.config = config
+    persona.save(update_fields=["config"])
+    return True
diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py
index 7e927c22..f19e70a0 100644
--- a/archivebox/personas/models.py
+++ b/archivebox/personas/models.py
@@ -117,6 +117,12 @@ class Persona(ModelWithConfig):
cookies_path = self.path / 'cookies.txt'
return str(cookies_path) if cookies_path.exists() else ''
+    @property
+    def AUTH_STORAGE_FILE(self) -> str:
+        """Derived path to auth.json for this persona (if it exists)."""
+        # Returns '' (not None) when absent so get_derived_config() can treat it as falsy.
+        auth_path = self.path / 'auth.json'
+        return str(auth_path) if auth_path.exists() else ''
+
def get_derived_config(self) -> dict:
"""
Get config dict with derived paths filled in.
@@ -127,6 +133,7 @@ class Persona(ModelWithConfig):
- CHROME_EXTENSIONS_DIR (derived from persona path)
- CHROME_DOWNLOADS_DIR (derived from persona path)
- COOKIES_FILE (derived from persona path, if file exists)
+ - AUTH_STORAGE_FILE (derived from persona path, if file exists)
- ACTIVE_PERSONA (set to this persona's name)
"""
derived = dict(self.config or {})
@@ -140,6 +147,8 @@ class Persona(ModelWithConfig):
derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
derived['COOKIES_FILE'] = self.COOKIES_FILE
+ if 'AUTH_STORAGE_FILE' not in derived and self.AUTH_STORAGE_FILE:
+ derived['AUTH_STORAGE_FILE'] = self.AUTH_STORAGE_FILE
# Always set ACTIVE_PERSONA to this persona's name
derived['ACTIVE_PERSONA'] = self.name
diff --git a/archivebox/services/archive_result_service.py b/archivebox/services/archive_result_service.py
index 1e346dc7..9912cf6b 100644
--- a/archivebox/services/archive_result_service.py
+++ b/archivebox/services/archive_result_service.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import json
import mimetypes
from collections import defaultdict
from pathlib import Path
@@ -7,9 +8,10 @@ from pathlib import Path
from asgiref.sync import sync_to_async
from django.utils import timezone
-from abx_dl.events import ArchiveResultEvent
+from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
from .process_service import ProcessService, parse_event_datetime
@@ -48,22 +50,93 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
def _normalize_status(status: str) -> str:
if status == "noresult":
- return "skipped"
+ return "noresults"
return status or "failed"
+def _has_content_files(output_files: list[str]) -> bool:
+    """True when any output file is real content (not a .log/.pid/.sh bookkeeping file)."""
+    return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
+
+
+def _iter_archiveresult_records(stdout: str) -> list[dict]:
+    """Extract JSON-line records of type "ArchiveResult" from a hook's stdout.
+
+    Lines that don't start with "{", fail to parse, or have a different
+    "type" are silently ignored.
+    """
+    records: list[dict] = []
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
+        if not line.startswith("{"):
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if record.get("type") == "ArchiveResult":
+            records.append(record)
+    return records
+
+
class ArchiveResultService(BaseService):
- LISTENS_TO = [ArchiveResultEvent]
+ LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
EMITS = []
def __init__(self, bus, *, process_service: ProcessService):
self.process_service = process_service
super().__init__(bus)
- async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
- await sync_to_async(self._project, thread_sensitive=True)(event)
+    async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
+        """Project an ArchiveResultEvent into the DB.
+
+        The filesystem scan (_collect_output_metadata) runs off the DB thread;
+        only the final projection goes through run_db_op.
+        """
+        snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
+        if snapshot_output_dir is None:
+            return  # snapshot row not found; nothing to attach the result to
+        plugin_dir = Path(snapshot_output_dir) / event.plugin
+        output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
+        await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
- def _project(self, event: ArchiveResultEvent) -> None:
+    async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
+        """Project ArchiveResult rows from a completed on_Snapshot* hook process.
+
+        Prefers explicit ArchiveResult JSON records printed on the process's
+        stdout; falls back to one synthetic record derived from the exit code
+        and output files when the hook printed none.
+        """
+        if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
+            return  # only snapshot hooks produce ArchiveResults
+
+        plugin_dir = Path(event.output_dir)
+        output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
+        records = _iter_archiveresult_records(event.stdout)
+        if records:
+            for record in records:
+                await run_db_op(
+                    self._project_from_process_completed,
+                    event,
+                    record,
+                    output_files,
+                    output_size,
+                    output_mimetypes,
+                )
+            return
+
+        # No explicit records: synthesize one from the process outcome.
+        # succeeded requires real content files; clean exits with only
+        # bookkeeping output count as skipped.
+        synthetic_record = {
+            "plugin": event.plugin_name,
+            "hook_name": event.hook_name,
+            "status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
+            "output_str": event.stderr if event.exit_code != 0 else "",
+            "error": event.stderr if event.exit_code != 0 else "",
+        }
+        await run_db_op(
+            self._project_from_process_completed,
+            event,
+            synthetic_record,
+            output_files,
+            output_size,
+            output_mimetypes,
+        )
+
+    def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
+        """Fetch the snapshot's output_dir (runs inside run_db_op); None when the row doesn't exist."""
+        from archivebox.core.models import Snapshot
+
+        snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
+        return str(snapshot.output_dir) if snapshot is not None else None
+
+ def _project(
+ self,
+ event: ArchiveResultEvent,
+ output_files: dict[str, dict],
+ output_size: int,
+ output_mimetypes: str,
+ ) -> None:
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process
@@ -86,8 +159,6 @@ class ArchiveResultService(BaseService):
},
)
- plugin_dir = Path(snapshot.output_dir) / event.plugin
- output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
result.process = process or result.process
result.status = _normalize_status(event.status)
result.output_str = event.output_str
@@ -97,7 +168,28 @@ class ArchiveResultService(BaseService):
result.output_mimetypes = output_mimetypes
result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
- result.retry_at = None
if event.error:
result.notes = event.error
result.save()
+
+ # Adapt one stdout record (or the synthetic fallback) from a completed
+ # process into an ArchiveResultEvent and reuse the normal _project() path.
+ # Record fields take precedence; the ProcessCompletedEvent supplies
+ # fallbacks (ids, timestamps, stderr-on-failure).
+ def _project_from_process_completed(
+ self,
+ event: ProcessCompletedEvent,
+ record: dict,
+ output_files: dict[str, dict],
+ output_size: int,
+ output_mimetypes: str,
+ ) -> None:
+ archive_result_event = ArchiveResultEvent(
+ snapshot_id=record.get("snapshot_id") or event.snapshot_id,
+ plugin=record.get("plugin") or event.plugin_name,
+ hook_name=record.get("hook_name") or event.hook_name,
+ status=record.get("status") or "",
+ process_id=event.process_id,
+ output_str=record.get("output_str") or "",
+ # Guard against hooks emitting non-dict output_json (e.g. a string).
+ output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
+ start_ts=event.start_ts,
+ end_ts=event.end_ts,
+ error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
+ )
+ self._project(archive_result_event, output_files, output_size, output_mimetypes)
diff --git a/archivebox/services/binary_service.py b/archivebox/services/binary_service.py
index bf571e8f..5bba73af 100644
--- a/archivebox/services/binary_service.py
+++ b/archivebox/services/binary_service.py
@@ -1,19 +1,23 @@
from __future__ import annotations
-from asgiref.sync import sync_to_async
+import asyncio
+
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
class BinaryService(BaseService):
LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
EMITS = []
+ # Handlers renamed to *__Outer and switched from raw sync_to_async to
+ # run_db_op, which also recycles stale DB connections around each call.
- async def on_BinaryEvent(self, event: BinaryEvent) -> None:
- await sync_to_async(self._project_binary, thread_sensitive=True)(event)
+ async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
+ await run_db_op(self._project_binary, event)
- async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
- await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)
+ async def on_BinaryInstalledEvent__Outer(self, event: BinaryInstalledEvent) -> None:
+ # Metadata resolution may shell out / probe the filesystem, so it runs
+ # on a plain worker thread (to_thread), separate from the DB write.
+ resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
+ await run_db_op(self._project_installed_binary, event, resolved)
def _project_binary(self, event: BinaryEvent) -> None:
from archivebox.machine.models import Binary, Machine
@@ -44,7 +48,39 @@ class BinaryService(BaseService):
},
)
- def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
+ # Best-effort fill-in of abspath/version/sha256/binprovider for an
+ # installed binary. If the event already carries all three key fields,
+ # return them as-is; otherwise re-resolve via abx_dl's load_binary.
+ # Runs on a worker thread (called via asyncio.to_thread) — no DB access.
+ def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
+ resolved = {
+ "abspath": event.abspath or "",
+ "version": event.version or "",
+ "sha256": event.sha256 or "",
+ "binproviders": event.binproviders or "",
+ "binprovider": event.binprovider or "",
+ }
+ if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
+ return resolved
+
+ try:
+ from abx_dl.dependencies import load_binary
+
+ # Fall back to a default provider chain when the event names none.
+ allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
+ spec = {
+ "name": event.name,
+ "binproviders": allowed_providers,
+ "overrides": event.overrides or {},
+ }
+ binary = load_binary(spec)
+ # getattr guards: tolerate partially-populated binary objects.
+ resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
+ resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
+ resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
+ provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
+ if provider_name:
+ resolved["binprovider"] = str(provider_name)
+ except Exception:
+ # Deliberate best-effort: resolution failure must not block the
+ # DB projection; callers get whatever the event carried.
+ pass
+
+ return resolved
+
+ def _project_installed_binary(self, event: BinaryInstalledEvent, resolved: dict[str, str]) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
@@ -55,10 +91,14 @@ class BinaryService(BaseService):
"status": Binary.StatusChoices.QUEUED,
},
)
- binary.abspath = event.abspath or binary.abspath
- binary.version = event.version or binary.version
- binary.sha256 = event.sha256 or binary.sha256
- binary.binprovider = event.binprovider or binary.binprovider
+ binary.abspath = resolved["abspath"] or binary.abspath
+ binary.version = resolved["version"] or binary.version
+ binary.sha256 = resolved["sha256"] or binary.sha256
+ if resolved["binproviders"]:
+ binary.binproviders = resolved["binproviders"]
+ binary.binprovider = resolved["binprovider"] or binary.binprovider
+ if event.overrides and binary.overrides != event.overrides:
+ binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
- binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
+ binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
diff --git a/archivebox/services/crawl_service.py b/archivebox/services/crawl_service.py
index 5add6c2a..1b5e314b 100644
--- a/archivebox/services/crawl_service.py
+++ b/archivebox/services/crawl_service.py
@@ -1,11 +1,10 @@
from __future__ import annotations
-from asgiref.sync import sync_to_async
-from django.utils import timezone
-
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
class CrawlService(BaseService):
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
@@ -15,17 +14,17 @@ class CrawlService(BaseService):
self.crawl_id = crawl_id
super().__init__(bus)
+ # All four crawl lifecycle handlers renamed to *__Outer and routed through
+ # run_db_op (connection recycling + thread-sensitive executor).
+ # Setup/Start/Cleanup all mark the crawl started; only Completed seals it.
- async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
- await sync_to_async(self._mark_started, thread_sensitive=True)()
+ async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
+ await run_db_op(self._mark_started)
- async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
- await sync_to_async(self._mark_started, thread_sensitive=True)()
+ async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
+ await run_db_op(self._mark_started)
- async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
- await sync_to_async(self._mark_started, thread_sensitive=True)()
+ async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
+ await run_db_op(self._mark_started)
- async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
- await sync_to_async(self._mark_completed, thread_sensitive=True)()
+ async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
+ await run_db_op(self._mark_completed)
def _mark_started(self) -> None:
from archivebox.crawls.models import Crawl
diff --git a/archivebox/services/db.py b/archivebox/services/db.py
new file mode 100644
index 00000000..0c8e542c
--- /dev/null
+++ b/archivebox/services/db.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from asgiref.sync import sync_to_async
+from django.db import close_old_connections
+
+
+# Run a DB-touching callable with stale-connection hygiene: Django only
+# closes expired connections at request boundaries, which never happen in
+# this long-lived event-bus process, so we do it explicitly before and
+# after every operation.
+def _run_db_op(func, *args, **kwargs):
+ close_old_connections()
+ try:
+ return func(*args, **kwargs)
+ finally:
+ close_old_connections()
+
+
+# Async entry point used by all services: executes func(*args, **kwargs) on
+# the thread-sensitive executor (serialized, matching Django's expectations
+# for ORM access from async code) and returns its result.
+async def run_db_op(func, *args, **kwargs):
+ return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
diff --git a/archivebox/services/live_ui.py b/archivebox/services/live_ui.py
new file mode 100644
index 00000000..40f149bc
--- /dev/null
+++ b/archivebox/services/live_ui.py
@@ -0,0 +1 @@
+from abx_dl.cli import LiveBusUI
diff --git a/archivebox/services/machine_service.py b/archivebox/services/machine_service.py
index 62966a91..574893ee 100644
--- a/archivebox/services/machine_service.py
+++ b/archivebox/services/machine_service.py
@@ -1,16 +1,17 @@
from __future__ import annotations
-from asgiref.sync import sync_to_async
from abx_dl.events import MachineEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
class MachineService(BaseService):
LISTENS_TO = [MachineEvent]
EMITS = []
+ # Renamed to *__Outer and switched to run_db_op for connection recycling.
- async def on_MachineEvent(self, event: MachineEvent) -> None:
- await sync_to_async(self._project, thread_sensitive=True)(event)
+ async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
+ await run_db_op(self._project, event)
def _project(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine
diff --git a/archivebox/services/process_service.py b/archivebox/services/process_service.py
index 32e702d7..2b6551b2 100644
--- a/archivebox/services/process_service.py
+++ b/archivebox/services/process_service.py
@@ -3,12 +3,13 @@ from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
-from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
if TYPE_CHECKING:
from archivebox.machine.models import Process
@@ -33,27 +34,33 @@ class ProcessService(BaseService):
self.process_ids: dict[str, str] = {}
super().__init__(bus)
+ # Process lifecycle handlers renamed to *__Outer and routed through
+ # run_db_op (recycles stale DB connections around each projection).
- async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
- await sync_to_async(self._project_started, thread_sensitive=True)(event)
+ async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
+ await run_db_op(self._project_started, event)
- async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
- await sync_to_async(self._project_completed, thread_sensitive=True)(event)
+ async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
+ await run_db_op(self._project_completed, event)
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
- from archivebox.machine.models import Machine, Process
+ from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
+ iface = NetworkInterface.current(refresh=True)
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
+ if process.iface_id != iface.id or process.machine_id != iface.machine_id:
+ process.iface = iface
+ process.machine = iface.machine
+ process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
process = Process.objects.create(
- machine=Machine.current(),
+ machine=iface.machine,
+ iface=iface,
process_type=process_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
@@ -77,12 +84,14 @@ class ProcessService(BaseService):
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.status = process.StatusChoices.RUNNING
process.retry_at = None
+ process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
def _project_completed(self, event: ProcessCompletedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir
- process.cmd = [event.hook_path, *event.hook_args]
+ if not process.cmd:
+ process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
@@ -92,4 +101,5 @@ class ProcessService(BaseService):
process.exit_code = event.exit_code
process.status = process.StatusChoices.EXITED
process.retry_at = None
+ process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
diff --git a/archivebox/services/runner.py b/archivebox/services/runner.py
index 283dfb21..9821ef3a 100644
--- a/archivebox/services/runner.py
+++ b/archivebox/services/runner.py
@@ -3,16 +3,21 @@ from __future__ import annotations
import asyncio
import json
import os
+import shutil
+import subprocess
import sys
import time
+from contextlib import nullcontext
from pathlib import Path
+from tempfile import TemporaryDirectory
from typing import Any
from django.utils import timezone
+from rich.console import Console
from abx_dl.events import BinaryEvent
-from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
-from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
+from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
+from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
@@ -21,6 +26,7 @@ from .machine_service import MachineService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
+from .live_ui import LiveBusUI
def _bus_name(prefix: str, identifier: str) -> str:
@@ -35,6 +41,19 @@ def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
return [name.strip() for name in raw.split(",") if name.strip()]
+# Count the crawl + snapshot hooks across the selected plugins; used to size
+# the LiveBusUI progress display. With no selection, all plugins count.
+def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
+ selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
+ total = 0
+ for plugin in selected.values():
+ total += len(list(plugin.get_crawl_hooks()))
+ total += len(list(plugin.get_snapshot_hooks()))
+ return total
+
+
+# Lightweight stderr trace line for runner progress; flushed immediately so
+# messages survive crashes and interleave correctly with child process output.
+def _runner_debug(message: str) -> None:
+ print(f"[runner] {message}", file=sys.stderr, flush=True)
+
+
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
@@ -78,10 +97,51 @@ async def _stop_bus_trace(bus) -> None:
bus._archivebox_trace_task = None
+# Spawn a detached `archivebox run --daemon` orchestrator unless one is
+# already RUNNING on this machine. Returns True if a new process was
+# launched, False if skipped (already running, or under pytest).
+def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
+ # Never fork background daemons out of a test run unless explicitly allowed.
+ if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
+ return False
+
+ from archivebox.config import CONSTANTS
+ from archivebox.machine.models import Machine, Process
+
+ # Clear dead RUNNING rows first so a crashed orchestrator doesn't block relaunch.
+ Process.cleanup_stale_running()
+ machine = Machine.current()
+ if Process.objects.filter(
+ machine=machine,
+ status=Process.StatusChoices.RUNNING,
+ process_type=Process.TypeChoices.ORCHESTRATOR,
+ ).exists():
+ return False
+
+ log_path = CONSTANTS.LOGS_DIR / "errors.log"
+ log_path.parent.mkdir(parents=True, exist_ok=True)
+ env = os.environ.copy()
+ env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))
+
+ # The log handle is closed after Popen returns; the child keeps its own
+ # inherited fd. start_new_session detaches it from our process group so
+ # it survives this process and ignores our terminal signals.
+ with log_path.open("a", encoding="utf-8") as log_handle:
+ subprocess.Popen(
+ [sys.executable, "-m", "archivebox", "run", "--daemon"],
+ cwd=str(CONSTANTS.DATA_DIR),
+ env=env,
+ stdin=subprocess.DEVNULL,
+ stdout=log_handle,
+ stderr=log_handle,
+ start_new_session=True,
+ )
+ return True
+
+
class CrawlRunner:
MAX_CONCURRENT_SNAPSHOTS = 8
- def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
+ def __init__(
+ self,
+ crawl,
+ *,
+ snapshot_ids: list[str] | None = None,
+ selected_plugins: list[str] | None = None,
+ process_discovered_snapshots_inline: bool = True,
+ ):
self.crawl = crawl
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
@@ -90,7 +150,12 @@ class CrawlRunner:
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
- self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
+ self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
+ self.snapshot_service = SnapshotService(
+ self.bus,
+ crawl_id=str(crawl.id),
+ schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
+ )
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
self.selected_plugins = selected_plugins
self.initial_snapshot_ids = snapshot_ids
@@ -100,6 +165,29 @@ class CrawlRunner:
self.persona = None
self.base_config: dict[str, Any] = {}
self.primary_url = ""
+ self._live_stream = None
+
+ # Build a fresh event bus with the full set of DB-projection services
+ # attached (mirrors the services wired onto self.bus in __init__), plus
+ # the abx services. Used to give each snapshot its own isolated bus.
+ # Returns (bus, abx_services); the caller owns stopping the bus.
+ def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
+ bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
+ process_service = ProcessService(bus)
+ MachineService(bus)
+ BinaryService(bus)
+ TagService(bus)
+ CrawlService(bus, crawl_id=str(self.crawl.id))
+ SnapshotService(
+ bus,
+ crawl_id=str(self.crawl.id),
+ # Discovered child snapshots are either run inline or left QUEUED
+ # for a background orchestrator to pick up, matching self.bus.
+ schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
+ )
+ ArchiveResultService(bus, process_service=process_service)
+ abx_services = setup_abx_services(
+ bus,
+ plugins=self.plugins,
+ config_overrides=config_overrides,
+ auto_install=True,
+ emit_jsonl=False,
+ )
+ return bus, abx_services
async def run(self) -> None:
from asgiref.sync import sync_to_async
@@ -107,35 +195,63 @@ class CrawlRunner:
try:
await sync_to_async(self._prepare, thread_sensitive=True)()
- _attach_bus_trace(self.bus)
- self.abx_services = setup_abx_services(
- self.bus,
- plugins=self.plugins,
- config_overrides=self.base_config,
- auto_install=True,
- emit_jsonl=False,
- )
- if self.crawl.get_system_task() == INSTALL_URL:
- await self._run_install_crawl()
- else:
- snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
- if snapshot_ids:
- root_snapshot_id = snapshot_ids[0]
- await self._run_crawl_setup(root_snapshot_id)
- for snapshot_id in snapshot_ids:
- await self.enqueue_snapshot(snapshot_id)
- await self._wait_for_snapshot_tasks()
- await self._run_crawl_cleanup(root_snapshot_id)
- if self.abx_services is not None:
- await self.abx_services.process.wait_for_background_monitors()
+ live_ui = self._create_live_ui()
+ with live_ui if live_ui is not None else nullcontext():
+ _attach_bus_trace(self.bus)
+ self.abx_services = setup_abx_services(
+ self.bus,
+ plugins=self.plugins,
+ config_overrides=self.base_config,
+ auto_install=True,
+ emit_jsonl=False,
+ )
+ if self.crawl.get_system_task() == INSTALL_URL:
+ await self._run_install_crawl()
+ else:
+ snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
+ if snapshot_ids:
+ root_snapshot_id = snapshot_ids[0]
+ _runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
+ await self._run_crawl_setup(root_snapshot_id)
+ _runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
+ for snapshot_id in snapshot_ids:
+ await self.enqueue_snapshot(snapshot_id)
+ _runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
+ await self._wait_for_snapshot_tasks()
+ _runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
+ _runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
+ await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
+ _runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
+ _runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
+ await self._run_crawl_cleanup(root_snapshot_id)
+ _runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
+ if self.abx_services is not None:
+ _runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
+ await self.abx_services.process.wait_for_background_monitors()
+ _runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
finally:
await _stop_bus_trace(self.bus)
await self.bus.stop()
+ if self._live_stream is not None:
+ try:
+ self._live_stream.close()
+ except Exception:
+ pass
+ self._live_stream = None
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
- if crawl.status != Crawl.StatusChoices.SEALED:
- crawl.status = Crawl.StatusChoices.SEALED
- crawl.retry_at = None
+ crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
+ if crawl_is_finished:
+ if crawl.status != Crawl.StatusChoices.SEALED:
+ crawl.status = Crawl.StatusChoices.SEALED
+ crawl.retry_at = None
+ await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
+ else:
+ if crawl.status == Crawl.StatusChoices.SEALED:
+ crawl.status = Crawl.StatusChoices.QUEUED
+ elif crawl.status != Crawl.StatusChoices.STARTED:
+ crawl.status = Crawl.StatusChoices.STARTED
+ crawl.retry_at = crawl.retry_at or timezone.now()
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
async def enqueue_snapshot(self, snapshot_id: str) -> None:
@@ -145,17 +261,36 @@ class CrawlRunner:
task = asyncio.create_task(self._run_snapshot(snapshot_id))
self.snapshot_tasks[snapshot_id] = task
+ # No-op schedule_snapshot callback: intentionally leaves discovered
+ # snapshots in their QUEUED state for a background runner to claim,
+ # instead of executing them inline like enqueue_snapshot does.
+ async def leave_snapshot_queued(self, snapshot_id: str) -> None:
+ return None
+
+ # Drain self.snapshot_tasks until empty. Rewritten from gather() so that
+ # (a) finished tasks are pruned from the dict as they complete, letting
+ # snapshots enqueued *during* the wait be picked up on the next pass, and
+ # (b) task.result() re-raises any snapshot exception promptly instead of
+ # after the whole batch finishes.
async def _wait_for_snapshot_tasks(self) -> None:
while True:
- active = [task for task in self.snapshot_tasks.values() if not task.done()]
- if not active:
+ pending_tasks: list[asyncio.Task[None]] = []
+ for snapshot_id, task in list(self.snapshot_tasks.items()):
+ if task.done():
+ # Identity check: only pop if the slot still holds this task
+ # (a retry may have replaced it under the same id).
+ if self.snapshot_tasks.get(snapshot_id) is task:
+ self.snapshot_tasks.pop(snapshot_id, None)
+ task.result()
+ continue
+ pending_tasks.append(task)
+ if not pending_tasks:
return
- await asyncio.gather(*active)
+ # Wake on the first completion so new work is noticed early.
+ done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
+ for task in done:
+ task.result()
def _prepare(self) -> None:
from archivebox.config.configset import get_config
+ from archivebox.machine.models import NetworkInterface, Process
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
+ current_iface = NetworkInterface.current(refresh=True)
+ current_process = Process.current()
+ if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
+ current_process.iface = current_iface
+ current_process.machine = current_iface.machine
+ current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
if self.selected_plugins is None:
@@ -168,6 +303,52 @@ class CrawlRunner:
if self.persona:
self.persona.cleanup_runtime_for_crawl(self.crawl)
+ # Build a LiveBusUI bound to a terminal, or return None in non-interactive
+ # contexts (pipes, cron). Prefers writing directly to /dev/tty so the live
+ # display stays visible even when stdout/stderr are redirected.
+ # Side effect: may set self._live_stream, which run() closes on teardown.
+ def _create_live_ui(self) -> LiveBusUI | None:
+ stdout_is_tty = sys.stdout.isatty()
+ stderr_is_tty = sys.stderr.isatty()
+ interactive_tty = stdout_is_tty or stderr_is_tty
+ if not interactive_tty:
+ return None
+ stream = sys.stderr if stderr_is_tty else sys.stdout
+ if os.path.exists("/dev/tty"):
+ try:
+ # Line-buffered so updates render immediately.
+ self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
+ stream = self._live_stream
+ except OSError:
+ # No controlling terminal (e.g. daemonized) — fall back to std stream.
+ self._live_stream = None
+ try:
+ terminal_size = os.get_terminal_size(stream.fileno())
+ terminal_width = terminal_size.columns
+ terminal_height = terminal_size.lines
+ except (AttributeError, OSError, ValueError):
+ # Stream has no usable fileno / not a tty: use env-or-default size.
+ terminal_size = shutil.get_terminal_size(fallback=(160, 40))
+ terminal_width = terminal_size.columns
+ terminal_height = terminal_size.lines
+ ui_console = Console(
+ file=stream,
+ force_terminal=True,
+ width=terminal_width,
+ height=terminal_height,
+ # Pin COLUMNS/LINES so rich doesn't re-measure a redirected stream.
+ _environ={
+ "COLUMNS": str(terminal_width),
+ "LINES": str(terminal_height),
+ },
+ )
+ plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
+ live_ui = LiveBusUI(
+ self.bus,
+ total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
+ timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
+ ui_console=ui_console,
+ interactive_tty=True,
+ )
+ live_ui.print_intro(
+ url=self.primary_url or INSTALL_URL,
+ output_dir=Path(self.crawl.output_dir),
+ plugins_label=plugins_label,
+ )
+ return live_ui
+
def _create_root_snapshots(self) -> list[str]:
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
@@ -290,18 +471,34 @@ class CrawlRunner:
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
- await download(
- url=snapshot["url"],
- plugins=self.plugins,
- output_dir=Path(snapshot["output_dir"]),
- selected_plugins=self.selected_plugins,
+ snapshot_bus, snapshot_services = self._create_projector_bus(
+ identifier=f"{self.crawl.id}_{snapshot['id']}",
config_overrides=snapshot["config"],
- bus=self.bus,
- emit_jsonl=False,
- snapshot=abx_snapshot,
- skip_crawl_setup=True,
- skip_crawl_cleanup=True,
)
+ try:
+ _attach_bus_trace(snapshot_bus)
+ _runner_debug(f"snapshot {snapshot_id} starting download()")
+ await download(
+ url=snapshot["url"],
+ plugins=self.plugins,
+ output_dir=Path(snapshot["output_dir"]),
+ selected_plugins=self.selected_plugins,
+ config_overrides=snapshot["config"],
+ bus=snapshot_bus,
+ emit_jsonl=False,
+ snapshot=abx_snapshot,
+ skip_crawl_setup=True,
+ skip_crawl_cleanup=True,
+ )
+ _runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
+ await snapshot_services.process.wait_for_background_monitors()
+ _runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
+ finally:
+ current_task = asyncio.current_task()
+ if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
+ self.snapshot_tasks.pop(snapshot_id, None)
+ await _stop_bus_trace(snapshot_bus)
+ await snapshot_bus.stop()
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
@@ -322,11 +519,24 @@ class CrawlRunner:
}
+# Signature extended (backward-compatibly, keyword-only with a default) so
+# callers can opt out of running discovered child snapshots inline; the
+# background orchestrator passes False and lets queued snapshots be claimed
+# separately.
-def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
+def run_crawl(
+ crawl_id: str,
+ *,
+ snapshot_ids: list[str] | None = None,
+ selected_plugins: list[str] | None = None,
+ process_discovered_snapshots_inline: bool = True,
+) -> None:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=crawl_id)
- asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
+ # Blocks until the whole crawl (and its snapshot tasks) complete.
+ asyncio.run(
+ CrawlRunner(
+ crawl,
+ snapshot_ids=snapshot_ids,
+ selected_plugins=selected_plugins,
+ process_discovered_snapshots_inline=process_discovered_snapshots_inline,
+ ).run()
+ )
async def _run_binary(binary_id: str) -> None:
@@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
+ live_stream = None
try:
- _attach_bus_trace(bus)
- await abx_install_plugins(
- plugin_names=plugin_names,
- plugins=plugins,
- config_overrides=config,
- emit_jsonl=False,
- bus=bus,
- )
- await abx_services.process.wait_for_background_monitors()
+ selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
+ plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
+ timeout_seconds = int(config.get("TIMEOUT") or 60)
+ stdout_is_tty = sys.stdout.isatty()
+ stderr_is_tty = sys.stderr.isatty()
+ interactive_tty = stdout_is_tty or stderr_is_tty
+ ui_console = None
+ live_ui = None
+
+ if interactive_tty:
+ stream = sys.stderr if stderr_is_tty else sys.stdout
+ if os.path.exists("/dev/tty"):
+ try:
+ live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
+ stream = live_stream
+ except OSError:
+ live_stream = None
+ try:
+ terminal_size = os.get_terminal_size(stream.fileno())
+ terminal_width = terminal_size.columns
+ terminal_height = terminal_size.lines
+ except (AttributeError, OSError, ValueError):
+ terminal_size = shutil.get_terminal_size(fallback=(160, 40))
+ terminal_width = terminal_size.columns
+ terminal_height = terminal_size.lines
+ ui_console = Console(
+ file=stream,
+ force_terminal=True,
+ width=terminal_width,
+ height=terminal_height,
+ _environ={
+ "COLUMNS": str(terminal_width),
+ "LINES": str(terminal_height),
+ },
+ )
+
+ with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
+ output_dir = Path(temp_dir)
+ if ui_console is not None:
+ live_ui = LiveBusUI(
+ bus,
+ total_hooks=_count_selected_hooks(selected_plugins, None),
+ timeout_seconds=timeout_seconds,
+ ui_console=ui_console,
+ interactive_tty=interactive_tty,
+ )
+ live_ui.print_intro(
+ url=INSTALL_URL,
+ output_dir=output_dir,
+ plugins_label=plugins_label,
+ )
+ with live_ui if live_ui is not None else nullcontext():
+ _attach_bus_trace(bus)
+ results = await abx_install_plugins(
+ plugin_names=plugin_names,
+ plugins=plugins,
+ output_dir=output_dir,
+ config_overrides=config,
+ emit_jsonl=False,
+ bus=bus,
+ )
+ await abx_services.process.wait_for_background_monitors()
+ if live_ui is not None:
+ live_ui.print_summary(results, output_dir=output_dir)
finally:
await _stop_bus_trace(bus)
await bus.stop()
+ try:
+ if live_stream is not None:
+ live_stream.close()
+ except Exception:
+ pass
+# Synchronous entry point: drives the async install flow to completion.
def run_install(*, plugin_names: list[str] | None = None) -> None:
asyncio.run(_run_install(plugin_names=plugin_names))
+# Recover STARTED crawls whose worker died without rescheduling them
+# (retry_at is NULL and no live worker/hook/binary process references the
+# crawl via its CRAWL_ID env var). Sealed-complete crawls are SEALED;
+# unfinished ones get retry_at=now so a runner re-claims them.
+# Returns the number of crawls whose state was repaired.
+def recover_orphaned_crawls() -> int:
+ from archivebox.crawls.models import Crawl
+ from archivebox.core.models import Snapshot
+ from archivebox.machine.models import Process
+
+ # Crawl ids still owned by a live process, discovered via the CRAWL_ID
+ # recorded in each running process's env snapshot.
+ active_crawl_ids: set[str] = set()
+ running_processes = Process.objects.filter(
+ status=Process.StatusChoices.RUNNING,
+ process_type__in=[
+ Process.TypeChoices.WORKER,
+ Process.TypeChoices.HOOK,
+ Process.TypeChoices.BINARY,
+ ],
+ ).only("env")
+
+ for proc in running_processes:
+ env = proc.env or {}
+ if not isinstance(env, dict):
+ continue
+ crawl_id = env.get("CRAWL_ID")
+ if crawl_id:
+ active_crawl_ids.add(str(crawl_id))
+
+ recovered = 0
+ now = timezone.now()
+ # retry_at NULL + STARTED is the orphan signature: started but no
+ # scheduled retry and (checked below) no live owner.
+ orphaned_crawls = Crawl.objects.filter(
+ status=Crawl.StatusChoices.STARTED,
+ retry_at__isnull=True,
+ ).prefetch_related("snapshot_set")
+
+ for crawl in orphaned_crawls:
+ if str(crawl.id) in active_crawl_ids:
+ continue
+
+ snapshots = list(crawl.snapshot_set.all())
+ # All work done (or nothing to do): seal the crawl.
+ # NOTE(review): the no-snapshots case seals immediately — confirm a
+ # crawl can't legitimately be STARTED before its snapshots exist.
+ if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots):
+ crawl.status = Crawl.StatusChoices.SEALED
+ crawl.retry_at = None
+ crawl.save(update_fields=["status", "retry_at", "modified_at"])
+ recovered += 1
+ continue
+
+ # Unfinished: make it immediately claimable by the next runner pass.
+ crawl.retry_at = now
+ crawl.save(update_fields=["retry_at", "modified_at"])
+ recovered += 1
+
+ return recovered
+
+
+# Recover STARTED snapshots abandoned by a dead worker (retry_at NULL and no
+# live process references them via SNAPSHOT_ID). Snapshots whose archive
+# results are all final are SEALED (and their crawl sealed if finished);
+# otherwise both snapshot and crawl are re-QUEUED with retry_at=now.
+# Returns the number of snapshots whose state was repaired.
+def recover_orphaned_snapshots() -> int:
+ from archivebox.crawls.models import Crawl
+ from archivebox.core.models import ArchiveResult, Snapshot
+ from archivebox.machine.models import Process
+
+ # Snapshot ids still owned by a live process (SNAPSHOT_ID in its env).
+ active_snapshot_ids: set[str] = set()
+ running_processes = Process.objects.filter(
+ status=Process.StatusChoices.RUNNING,
+ process_type__in=[
+ Process.TypeChoices.WORKER,
+ Process.TypeChoices.HOOK,
+ Process.TypeChoices.BINARY,
+ ],
+ ).only("env")
+
+ for proc in running_processes:
+ env = proc.env or {}
+ if not isinstance(env, dict):
+ continue
+ snapshot_id = env.get("SNAPSHOT_ID")
+ if snapshot_id:
+ active_snapshot_ids.add(str(snapshot_id))
+
+ recovered = 0
+ now = timezone.now()
+ orphaned_snapshots = (
+ Snapshot.objects
+ .filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
+ .select_related("crawl")
+ .prefetch_related("archiveresult_set")
+ )
+
+ for snapshot in orphaned_snapshots:
+ if str(snapshot.id) in active_snapshot_ids:
+ continue
+
+ results = list(snapshot.archiveresult_set.all())
+ # Every result reached a final state: the snapshot is done — seal it
+ # and backfill downloaded_at if the dead worker never set it.
+ if results and all(result.status in ArchiveResult.FINAL_STATES for result in results):
+ snapshot.status = Snapshot.StatusChoices.SEALED
+ snapshot.retry_at = None
+ snapshot.downloaded_at = snapshot.downloaded_at or now
+ snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
+
+ # Cascade: seal the parent crawl too once all its work is finished.
+ crawl = snapshot.crawl
+ if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
+ crawl.status = Crawl.StatusChoices.SEALED
+ crawl.retry_at = None
+ crawl.save(update_fields=["status", "retry_at", "modified_at"])
+ recovered += 1
+ continue
+
+ # Unfinished: push snapshot and crawl back to QUEUED for re-claim.
+ snapshot.status = Snapshot.StatusChoices.QUEUED
+ snapshot.retry_at = now
+ snapshot.save(update_fields=["status", "retry_at", "modified_at"])
+
+ crawl = snapshot.crawl
+ crawl.status = Crawl.StatusChoices.QUEUED
+ crawl.retry_at = now
+ crawl.save(update_fields=["status", "retry_at", "modified_at"])
+ recovered += 1
+
+ return recovered
+
+
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
from archivebox.crawls.models import Crawl, CrawlSchedule
+ from archivebox.core.models import Snapshot
from archivebox.machine.models import Binary
while True:
@@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
.first()
)
if binary is not None:
+ if not binary.claim_processing_lock(lock_seconds=60):
+ continue
run_binary(str(binary.id))
continue
- pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
+ queued_crawls = Crawl.objects.filter(
+ retry_at__lte=timezone.now(),
+ status=Crawl.StatusChoices.QUEUED,
+ )
+ if crawl_id:
+ queued_crawls = queued_crawls.filter(id=crawl_id)
+ queued_crawls = queued_crawls.order_by("retry_at", "created_at")
+
+ queued_crawl = queued_crawls.first()
+ if queued_crawl is not None:
+ if not queued_crawl.claim_processing_lock(lock_seconds=60):
+ continue
+ run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False)
+ continue
+
+ if crawl_id is None:
+ snapshot = (
+ Snapshot.objects.filter(retry_at__lte=timezone.now())
+ .exclude(status=Snapshot.StatusChoices.SEALED)
+ .select_related("crawl")
+ .order_by("retry_at", "created_at")
+ .first()
+ )
+ if snapshot is not None:
+ if not snapshot.claim_processing_lock(lock_seconds=60):
+ continue
+ run_crawl(
+ str(snapshot.crawl_id),
+ snapshot_ids=[str(snapshot.id)],
+ process_discovered_snapshots_inline=False,
+ )
+ continue
+
+ pending = Crawl.objects.filter(
+ retry_at__lte=timezone.now(),
+ status=Crawl.StatusChoices.STARTED,
+ )
if crawl_id:
pending = pending.filter(id=crawl_id)
pending = pending.order_by("retry_at", "created_at")
@@ -451,4 +874,7 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) ->
continue
return 0
- run_crawl(str(crawl.id))
+ if not crawl.claim_processing_lock(lock_seconds=60):
+ continue
+
+ run_crawl(str(crawl.id), process_discovered_snapshots_inline=False)
diff --git a/archivebox/services/snapshot_service.py b/archivebox/services/snapshot_service.py
index bdb35641..c4acbe5d 100644
--- a/archivebox/services/snapshot_service.py
+++ b/archivebox/services/snapshot_service.py
@@ -1,13 +1,13 @@
from __future__ import annotations
-import re
-
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
class SnapshotService(BaseService):
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
@@ -18,13 +18,17 @@ class SnapshotService(BaseService):
self.schedule_snapshot = schedule_snapshot
super().__init__(bus)
- async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
- snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
+ async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
+ snapshot_id = await run_db_op(self._project_snapshot, event)
+ if snapshot_id:
+ await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
- async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
- await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)
+ async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
+ snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
+ if snapshot_id:
+ await sync_to_async(self._write_snapshot_details)(snapshot_id)
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
from archivebox.core.models import Snapshot
@@ -39,7 +43,6 @@ class SnapshotService(BaseService):
snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = None
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
- snapshot.ensure_crawl_symlink()
return str(snapshot.id)
if event.depth > crawl.max_depth:
@@ -73,56 +76,36 @@ class SnapshotService(BaseService):
if snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
- snapshot.ensure_crawl_symlink()
return str(snapshot.id)
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
- from archivebox.config.configset import get_config
+ return crawl.url_passes_filters(url, snapshot=parent_snapshot)
- config = get_config(
- user=getattr(crawl, "created_by", None),
- crawl=crawl,
- snapshot=parent_snapshot,
- )
-
- def to_pattern_list(value):
- if isinstance(value, list):
- return value
- if isinstance(value, str):
- return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
- return []
-
- allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
- denylist = to_pattern_list(config.get("URL_DENYLIST", ""))
-
- for pattern in denylist:
- try:
- if re.search(pattern, url):
- return False
- except re.error:
- continue
-
- if allowlist:
- for pattern in allowlist:
- try:
- if re.search(pattern, url):
- return True
- except re.error:
- continue
- return False
-
- return True
-
- def _seal_snapshot(self, snapshot_id: str) -> None:
+ def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
if snapshot is None:
- return
+ return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
+ return str(snapshot.id)
+
+ def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
+ from archivebox.core.models import Snapshot
+
+ snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
+ if snapshot is not None:
+ snapshot.ensure_crawl_symlink()
+
+ def _write_snapshot_details(self, snapshot_id: str) -> None:
+ from archivebox.core.models import Snapshot
+
+ snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
+ if snapshot is None:
+ return
snapshot.write_index_jsonl()
snapshot.write_json_details()
snapshot.write_html_details()
diff --git a/archivebox/services/tag_service.py b/archivebox/services/tag_service.py
index 69d0fe2f..78622609 100644
--- a/archivebox/services/tag_service.py
+++ b/archivebox/services/tag_service.py
@@ -1,16 +1,17 @@
from __future__ import annotations
-from asgiref.sync import sync_to_async
from abx_dl.events import TagEvent
from abx_dl.services.base import BaseService
+from .db import run_db_op
+
class TagService(BaseService):
LISTENS_TO = [TagEvent]
EMITS = []
- async def on_TagEvent(self, event: TagEvent) -> None:
- await sync_to_async(self._project, thread_sensitive=True)(event)
+ async def on_TagEvent__Outer(self, event: TagEvent) -> None:
+ await run_db_op(self._project, event)
def _project(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, Tag
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html
index b2b5bcc9..f9d42c66 100644
--- a/archivebox/templates/admin/base.html
+++ b/archivebox/templates/admin/base.html
@@ -1083,8 +1083,11 @@
width: 100% !important;
}
- body.filters-collapsed.change-list #changelist .changelist-form-container > div {
+ body.filters-collapsed.change-list #changelist .changelist-form-container > div,
+ body.filters-collapsed.change-list #changelist .changelist-form-container > form {
max-width: 100% !important;
+ width: 100% !important;
+ flex: 1 1 100% !important;
}
/* Actions bar */
@@ -1372,7 +1375,8 @@
order: 2;
align-self: flex-start;
}
- body.change-list #changelist .changelist-form-container > div {
+ body.change-list #changelist .changelist-form-container > div,
+ body.change-list #changelist .changelist-form-container > form {
flex: 1 1 auto;
min-width: 0;
order: 1;
diff --git a/archivebox/templates/admin/core/tag/change_form.html b/archivebox/templates/admin/core/tag/change_form.html
new file mode 100644
index 00000000..cde49905
--- /dev/null
+++ b/archivebox/templates/admin/core/tag/change_form.html
@@ -0,0 +1,268 @@
+{% extends "admin/change_form.html" %}
+
+{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %}
+
+{% block extrastyle %}
+{{ block.super }}
+
+{% endblock %}
+
+{% block form_top %}
+
+{{ block.super }}
+{% endblock %}
+
+{% block after_field_sets %}
+{{ block.super }}
+
+ Similar Tags
+ Updates while typing.
+
+
+
+{{ tag_similar_cards|json_script:"abx-tag-similar-data" }}
+
+
+{% endblock %}
diff --git a/archivebox/templates/admin/core/tag/change_list.html b/archivebox/templates/admin/core/tag/change_list.html
new file mode 100644
index 00000000..5ce822c5
--- /dev/null
+++ b/archivebox/templates/admin/core/tag/change_list.html
@@ -0,0 +1,997 @@
+{% extends "admin/change_list.html" %}
+
+{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %}
+
+{% block object-tools %}{% endblock %}
+
+{% block extrastyle %}
+{{ block.super }}
+
+{% endblock %}
+
+{% block content %}
+
+
+
+
+
+
+ {% if initial_tag_cards %}
+ {% for card in initial_tag_cards %}
+
+
+
+ {% if card.snapshots %}
+ {% for snapshot in card.snapshots %}
+
+
+ {{ snapshot.title }}
+
+ {% endfor %}
+ {% else %}
+
No snapshots attached yet.
+ {% endif %}
+
+
+ {% endfor %}
+ {% else %}
+
No tags.
+ {% endif %}
+
+
+
+
+{{ initial_tag_cards|json_script:"abx-tag-cards-data" }}
+
+
+{% endblock %}
diff --git a/archivebox/templates/admin/personas/persona/change_form.html b/archivebox/templates/admin/personas/persona/change_form.html
new file mode 100644
index 00000000..262c66c9
--- /dev/null
+++ b/archivebox/templates/admin/personas/persona/change_form.html
@@ -0,0 +1,249 @@
+{% extends "admin/change_form.html" %}
+
+{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %}
+
+{% block extrastyle %}
+{{ block.super }}
+
+{% endblock %}
+
+{% block extrahead %}
+{{ block.super }}
+
+{% endblock %}
+
+{% block form_top %}
+
+
+
Bootstrap a persona from a real browser session
+
+ Pick a local Chromium profile, paste an absolute profile path, or attach to a live CDP endpoint.
+ The form saves the Persona normally, then imports profile files, cookies, and optional tab storage into
+ the Persona's own directories.
+
+
+
+
+{{ block.super }}
+{% endblock %}
diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html
index cd676de9..f5e48789 100644
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -706,14 +706,14 @@
? Math.max(0, Math.min(100, extractor.progress))
: null;
const progressStyle = progress !== null ? ` style="width: ${progress}%;"` : '';
- const pidHtml = extractor.pid ? `pid ${extractor.pid} ` : '';
+ const pidHtml = extractor.status === 'started' && extractor.pid ? `pid ${extractor.pid} ` : '';
return `
@@ -742,6 +742,23 @@
`;
}
+ const hasProcessEntries = (snapshot.all_plugins || []).some(extractor => extractor.source === 'process');
+ const hasArchiveResults = (snapshot.all_plugins || []).some(extractor => extractor.source === 'archiveresult');
+ const processOnly = hasProcessEntries && !hasArchiveResults;
+ const runningProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'started').length;
+ const failedProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'failed').length;
+ const snapshotMeta = (snapshot.total_plugins || 0) > 0
+ ? processOnly
+ ? runningProcessCount > 0
+ ? `Running ${runningProcessCount}/${snapshot.total_plugins || 0} setup hooks`
+ : failedProcessCount > 0
+ ? `${failedProcessCount} setup hook${failedProcessCount === 1 ? '' : 's'} failed`
+ : `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} setup hooks`
+ : hasProcessEntries
+ ? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} tasks${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed) ` : ''}${runningProcessCount > 0 ? ` (${runningProcessCount} hooks running) ` : ''}`
+ : `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed) ` : ''}`
+ : 'Waiting for extractors...';
+
return `
@@ -784,6 +799,29 @@
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
}
+ let setupHtml = '';
+ if (crawl.setup_plugins && crawl.setup_plugins.length > 0) {
+ const setupSummary = `${crawl.setup_completed_plugins || 0}/${crawl.setup_total_plugins || 0} setup tasks${(crawl.setup_failed_plugins || 0) > 0 ? `
(${crawl.setup_failed_plugins} failed) ` : ''}`;
+ const sortedSetup = [...crawl.setup_plugins].sort((a, b) =>
+ (a.plugin || '').localeCompare(b.plugin || '')
+ );
+ setupHtml = `
+
+
+
+
+ `;
+ }
// Show warning if crawl is stuck (queued but can't start)
let warningHtml = '';
@@ -847,6 +885,7 @@
${warningHtml}
+ ${setupHtml}
${snapshotsHtml}
diff --git a/archivebox/templates/core/add.html b/archivebox/templates/core/add.html
index dc5455c4..6663770a 100644
--- a/archivebox/templates/core/add.html
+++ b/archivebox/templates/core/add.html
@@ -38,56 +38,76 @@