From f400a2cd67222f5142820460ca3c348b7680ea3a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Mar 2026 20:23:45 -0700 Subject: [PATCH] WIP: checkpoint working tree before rebasing onto dev --- .github/workflows/release-runner.yml | 45 + .github/workflows/release.yml | 1 - archivebox/api/urls.py | 3 +- archivebox/api/v1_core.py | 245 +++- archivebox/base_models/admin.py | 550 ++++++++- archivebox/cli/archivebox_add.py | 18 +- archivebox/cli/archivebox_archiveresult.py | 57 +- archivebox/cli/archivebox_extract.py | 76 +- archivebox/cli/archivebox_list.py | 11 +- archivebox/cli/archivebox_persona.py | 63 +- archivebox/cli/archivebox_pluginmap.py | 65 +- archivebox/cli/archivebox_run.py | 39 +- archivebox/cli/archivebox_server.py | 171 +-- archivebox/cli/archivebox_snapshot.py | 31 +- archivebox/config/common.py | 4 +- archivebox/config/views.py | 452 ++++++-- archivebox/core/admin_archiveresults.py | 250 +++- archivebox/core/admin_site.py | 2 + archivebox/core/admin_snapshots.py | 42 +- archivebox/core/admin_tags.py | 294 ++--- archivebox/core/forms.py | 111 +- archivebox/core/host_utils.py | 4 + .../0032_remove_archiveresult_retry_at.py | 15 + archivebox/core/models.py | 693 ++--------- archivebox/core/settings.py | 3 +- archivebox/core/tag_utils.py | 271 +++++ archivebox/core/templatetags/core_tags.py | 6 + archivebox/core/views.py | 287 ++++- archivebox/core/widgets.py | 300 ++++- archivebox/crawls/admin.py | 391 ++++++- archivebox/crawls/models.py | 277 ++++- archivebox/hooks.py | 43 +- archivebox/machine/admin.py | 21 +- archivebox/machine/models.py | 167 ++- archivebox/misc/util.py | 53 +- archivebox/personas/admin.py | 169 ++- archivebox/personas/export_browser_state.js | 210 ++++ archivebox/personas/forms.py | 176 +++ archivebox/personas/importers.py | 845 ++++++++++++++ archivebox/personas/models.py | 9 + archivebox/services/archive_result_service.py | 110 +- archivebox/services/binary_service.py | 62 +- archivebox/services/crawl_service.py | 21 +- 
archivebox/services/db.py | 16 + archivebox/services/live_ui.py | 1 + archivebox/services/machine_service.py | 7 +- archivebox/services/process_service.py | 26 +- archivebox/services/runner.py | 534 ++++++++- archivebox/services/snapshot_service.py | 73 +- archivebox/services/tag_service.py | 7 +- archivebox/templates/admin/base.html | 8 +- .../templates/admin/core/tag/change_form.html | 268 +++++ .../templates/admin/core/tag/change_list.html | 997 ++++++++++++++++ .../admin/personas/persona/change_form.html | 249 ++++ .../templates/admin/progress_monitor.html | 51 +- archivebox/templates/core/add.html | 1022 +++++++++++++++-- archivebox/templates/core/base.html | 6 +- archivebox/templates/core/navigation.html | 2 +- archivebox/templates/core/snapshot.html | 20 +- archivebox/templates/core/snapshot_live.html | 11 +- archivebox/templates/static/add.css | 500 +++++++- archivebox/templates/static/admin.css | 96 +- archivebox/tests/test_add_view.py | 195 ++++ archivebox/tests/test_admin_config_widget.py | 151 +++ archivebox/tests/test_admin_links.py | 127 ++ archivebox/tests/test_admin_views.py | 228 ++++ .../tests/test_archive_result_service.py | 305 +++++ archivebox/tests/test_cli_add.py | 47 + archivebox/tests/test_cli_archiveresult.py | 105 +- archivebox/tests/test_cli_crawl.py | 2 +- archivebox/tests/test_cli_piping.py | 25 +- archivebox/tests/test_cli_run.py | 182 +++ archivebox/tests/test_cli_server.py | 70 ++ archivebox/tests/test_cli_snapshot.py | 2 +- archivebox/tests/test_config_views.py | 326 ++++++ archivebox/tests/test_crawl_admin.py | 220 ++++ archivebox/tests/test_machine_models.py | 95 +- archivebox/tests/test_persona_admin.py | 191 +++ archivebox/tests/test_runner.py | 640 +++++++++++ archivebox/tests/test_tag_admin.py | 205 ++++ archivebox/tests/test_urls.py | 27 +- .../management/commands/runner_watch.py | 73 +- archivebox/workers/supervisord_util.py | 72 +- archivebox/workers/tasks.py | 6 +- bin/release.sh | 393 ++++++- pyproject.toml | 17 +- 
uv.lock | 154 ++- 87 files changed, 12607 insertions(+), 1808 deletions(-) create mode 100644 .github/workflows/release-runner.yml create mode 100644 archivebox/core/migrations/0032_remove_archiveresult_retry_at.py create mode 100644 archivebox/core/tag_utils.py create mode 100644 archivebox/personas/export_browser_state.js create mode 100644 archivebox/personas/forms.py create mode 100644 archivebox/personas/importers.py create mode 100644 archivebox/services/db.py create mode 100644 archivebox/services/live_ui.py create mode 100644 archivebox/templates/admin/core/tag/change_form.html create mode 100644 archivebox/templates/admin/core/tag/change_list.html create mode 100644 archivebox/templates/admin/personas/persona/change_form.html create mode 100644 archivebox/tests/test_add_view.py create mode 100644 archivebox/tests/test_admin_config_widget.py create mode 100644 archivebox/tests/test_admin_links.py create mode 100644 archivebox/tests/test_archive_result_service.py create mode 100644 archivebox/tests/test_config_views.py create mode 100644 archivebox/tests/test_crawl_admin.py create mode 100644 archivebox/tests/test_persona_admin.py create mode 100644 archivebox/tests/test_runner.py create mode 100644 archivebox/tests/test_tag_admin.py diff --git a/.github/workflows/release-runner.yml b/.github/workflows/release-runner.yml new file mode 100644 index 00000000..e9dd3ac4 --- /dev/null +++ b/.github/workflows/release-runner.yml @@ -0,0 +1,45 @@ +name: Release State + +on: + push: + branches: + - '**' + workflow_dispatch: + +permissions: + contents: write + id-token: write + +jobs: + release-state: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: true + ref: ${{ github.ref_name }} + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Configure git identity + run: 
| + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Run release script + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_TOKEN: ${{ github.token }} + PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }} + run: ./bin/release.sh diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 86a507fa..032127ae 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,7 +9,6 @@ name: Release # This workflow ensures the correct ordering during a release. on: - workflow_dispatch: release: types: [published] diff --git a/archivebox/api/urls.py b/archivebox/api/urls.py index 81f8cb43..d22e07f7 100644 --- a/archivebox/api/urls.py +++ b/archivebox/api/urls.py @@ -6,8 +6,9 @@ from django.views.generic.base import RedirectView from .v1_api import urls as v1_api_urls urlpatterns = [ - path("", RedirectView.as_view(url='/api/v1')), + path("", RedirectView.as_view(url='/api/v1/docs')), + path("v1/", RedirectView.as_view(url='/api/v1/docs')), path("v1/", v1_api_urls), path("v1", RedirectView.as_view(url='/api/v1/docs')), diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 062eba8b..51dab0e9 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -6,7 +6,8 @@ from typing import List, Optional, Union, Any, Annotated from datetime import datetime from django.db.models import Model, Q -from django.http import HttpRequest +from django.conf import settings +from django.http import HttpRequest, HttpResponse from django.core.exceptions import ValidationError from django.contrib.auth import get_user_model from django.contrib.auth.models import User @@ -18,6 +19,22 @@ from ninja.pagination import paginate, PaginationBase from ninja.errors import HttpError from archivebox.core.models import Snapshot, ArchiveResult, Tag +from archivebox.api.auth import auth_using_token +from archivebox.config.common 
import SERVER_CONFIG +from archivebox.core.tag_utils import ( + build_tag_cards, + delete_tag as delete_tag_record, + export_tag_snapshots_jsonl, + export_tag_urls, + get_matching_tags, + get_or_create_tag, + get_tag_by_ref, + normalize_created_by_filter, + normalize_created_year_filter, + normalize_has_snapshots_filter, + normalize_tag_sort, + rename_tag as rename_tag_record, +) from archivebox.crawls.models import Crawl from archivebox.api.v1_crawls import CrawlSchema @@ -404,7 +421,7 @@ class TagSchema(Schema): def get_tags(request: HttpRequest): setattr(request, 'with_snapshots', False) setattr(request, 'with_archiveresults', False) - return Tag.objects.all().distinct() + return get_matching_tags() @router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag") @@ -412,9 +429,9 @@ def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True): setattr(request, 'with_snapshots', with_snapshots) setattr(request, 'with_archiveresults', False) try: - return Tag.objects.get(id__icontains=tag_id) + return get_tag_by_ref(tag_id) except (Tag.DoesNotExist, ValidationError): - return Tag.objects.get(slug__icontains=tag_id) + raise HttpError(404, 'Tag not found') @router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID") @@ -459,6 +476,55 @@ class TagCreateResponseSchema(Schema): created: bool +class TagSearchSnapshotSchema(Schema): + id: str + title: str + url: str + favicon_url: str + admin_url: str + archive_url: str + downloaded_at: Optional[str] = None + + +class TagSearchCardSchema(Schema): + id: int + name: str + slug: str + num_snapshots: int + filter_url: str + edit_url: str + export_urls_url: str + export_jsonl_url: str + rename_url: str + delete_url: str + snapshots: List[TagSearchSnapshotSchema] + + +class TagSearchResponseSchema(Schema): + tags: List[TagSearchCardSchema] + sort: str + created_by: str + year: str + has_snapshots: str + + +class 
TagUpdateSchema(Schema): + name: str + + +class TagUpdateResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + slug: str + + +class TagDeleteResponseSchema(Schema): + success: bool + tag_id: int + deleted_count: int + + class TagSnapshotRequestSchema(Schema): snapshot_id: str tag_name: Optional[str] = None @@ -471,41 +537,82 @@ class TagSnapshotResponseSchema(Schema): tag_name: str -@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete") +@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags") +def search_tags( + request: HttpRequest, + q: str = "", + sort: str = 'created_desc', + created_by: str = '', + year: str = '', + has_snapshots: str = 'all', +): + """Return detailed tag cards for admin/live-search UIs.""" + normalized_sort = normalize_tag_sort(sort) + normalized_created_by = normalize_created_by_filter(created_by) + normalized_year = normalize_created_year_filter(year) + normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots) + return { + 'tags': build_tag_cards( + query=q, + request=request, + sort=normalized_sort, + created_by=normalized_created_by, + year=normalized_year, + has_snapshots=normalized_has_snapshots, + ), + 'sort': normalized_sort, + 'created_by': normalized_created_by, + 'year': normalized_year, + 'has_snapshots': normalized_has_snapshots, + } + + +def _public_tag_listing_enabled() -> bool: + explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None) + if explicit is not None: + return bool(explicit) + return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX)) + + +def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool: + user = getattr(request, 'user', None) + if getattr(user, 'is_authenticated', False): + return True + + token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key') + auth_header = request.headers.get('Authorization', '') + if not token and 
auth_header.lower().startswith('bearer '): + token = auth_header.split(None, 1)[1].strip() + + if token and auth_using_token(token=token, request=request): + return True + + return _public_tag_listing_enabled() + + +@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None) def tags_autocomplete(request: HttpRequest, q: str = ""): """Return tags matching the query for autocomplete.""" - if not q: - # Return all tags if no query (limited to 50) - tags = Tag.objects.all().order_by('name')[:50] - else: - tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20] + if not _request_has_tag_autocomplete_access(request): + raise HttpError(401, 'Authentication required') + + tags = get_matching_tags(q)[:50 if not q else 20] return { - 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags] + 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags] } @router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create") def tags_create(request: HttpRequest, data: TagCreateSchema): """Create a new tag or return existing one.""" - name = data.name.strip() - if not name: - raise HttpError(400, 'Tag name is required') - - tag, created = Tag.objects.get_or_create( - name__iexact=name, - defaults={ - 'name': name, - 'created_by': request.user if request.user.is_authenticated else None, - } - ) - - # If found by case-insensitive match, use that tag - if not created: - existing_tag = Tag.objects.filter(name__iexact=name).first() - if existing_tag is None: - raise HttpError(500, 'Failed to load existing tag after get_or_create') - tag = existing_tag + try: + tag, created = get_or_create_tag( + data.name, + created_by=request.user if request.user.is_authenticated else None, + ) + except ValueError as err: + raise HttpError(400, str(err)) from err return { 'success': True, @@ -515,6 +622,62 @@ def tags_create(request: 
HttpRequest, data: TagCreateSchema): } +@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag") +def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema): + try: + tag = rename_tag_record(get_tag_by_ref(tag_id), data.name) + except Tag.DoesNotExist as err: + raise HttpError(404, 'Tag not found') from err + except ValueError as err: + raise HttpError(400, str(err)) from err + + return { + 'success': True, + 'tag_id': tag.pk, + 'tag_name': tag.name, + 'slug': tag.slug, + } + + +@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag") +def delete_tag(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, 'Tag not found') from err + + deleted_count, _ = delete_tag_record(tag) + return { + 'success': True, + 'tag_id': int(tag_id), + 'deleted_count': deleted_count, + } + + +@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export") +def tag_urls_export(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, 'Tag not found') from err + + response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8') + response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"' + return response + + +@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export") +def tag_snapshots_export(request: HttpRequest, tag_id: int): + try: + tag = get_tag_by_ref(tag_id) + except Tag.DoesNotExist as err: + raise HttpError(404, 'Tag not found') from err + + response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8') + response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"' + return response + + @router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot") def 
tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): """Add a tag to a snapshot. Creates the tag if it doesn't exist.""" @@ -534,24 +697,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema): # Get or create the tag if data.tag_name: - name = data.tag_name.strip() - if not name: - raise HttpError(400, 'Tag name is required') - - tag, _ = Tag.objects.get_or_create( - name__iexact=name, - defaults={ - 'name': name, - 'created_by': request.user if request.user.is_authenticated else None, - } - ) - # If found by case-insensitive match, use that tag - existing_tag = Tag.objects.filter(name__iexact=name).first() - if existing_tag is not None: - tag = existing_tag + try: + tag, _ = get_or_create_tag( + data.tag_name, + created_by=request.user if request.user.is_authenticated else None, + ) + except ValueError as err: + raise HttpError(400, str(err)) from err elif data.tag_id: try: - tag = Tag.objects.get(pk=data.tag_id) + tag = get_tag_by_ref(data.tag_id) except Tag.DoesNotExist: raise HttpError(404, 'Tag not found') else: diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py index 0cd64854..116e3654 100644 --- a/archivebox/base_models/admin.py +++ b/archivebox/base_models/admin.py @@ -4,7 +4,7 @@ __package__ = 'archivebox.base_models' import json from collections.abc import Mapping -from typing import TypedDict +from typing import NotRequired, TypedDict from django import forms from django.contrib import admin @@ -17,9 +17,13 @@ from django_object_actions import DjangoObjectActions class ConfigOption(TypedDict): plugin: str - type: str + type: str | list[str] default: object description: str + enum: NotRequired[list[object]] + pattern: NotRequired[str] + minimum: NotRequired[int | float] + maximum: NotRequired[int | float] class KeyValueWidget(forms.Widget): @@ -44,12 +48,16 @@ class KeyValueWidget(forms.Widget): options: dict[str, ConfigOption] = {} for plugin_name, schema in 
plugin_configs.items(): for key, prop in schema.get('properties', {}).items(): - options[key] = { + option: ConfigOption = { 'plugin': plugin_name, 'type': prop.get('type', 'string'), 'default': prop.get('default', ''), 'description': prop.get('description', ''), } + for schema_key in ('enum', 'pattern', 'minimum', 'maximum'): + if schema_key in prop: + option[schema_key] = prop[schema_key] + options[key] = option return options except Exception: return {} @@ -98,14 +106,12 @@ class KeyValueWidget(forms.Widget): ''' # Render existing key-value pairs - row_idx = 0 for key, val in data.items(): val_str = json.dumps(val) if not isinstance(val, str) else val - html += self._render_row(widget_id, row_idx, key, val_str) - row_idx += 1 + html += self._render_row(widget_id, key, val_str) # Always add one empty row for new entries - html += self._render_row(widget_id, row_idx, '', '') + html += self._render_row(widget_id, '', '') html += f''' @@ -114,22 +120,450 @@ class KeyValueWidget(forms.Widget): style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;"> + Add Row - ''' return mark_safe(html) - def _render_row(self, widget_id: str, idx: int, key: str, value: str) -> str: + def _render_row(self, widget_id: str, key: str, value: str) -> str: return f''' -
- - - +
+
+ + + + +
+
''' diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index cbb6c7de..e38f4155 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]: def add(urls: str | list[str], depth: int | str=0, tag: str='', + url_allowlist: str='', + url_denylist: str='', parser: str="auto", plugins: str="", persona: str='Default', overwrite: bool=False, - update: bool=not ARCHIVING_CONFIG.ONLY_NEW, + update: bool | None=None, index_only: bool=False, bg: bool=False, created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]: @@ -85,6 +87,8 @@ def add(urls: str | list[str], created_by_id = created_by_id or get_or_create_system_user_pk() started_at = timezone.now() + if update is None: + update = not ARCHIVING_CONFIG.ONLY_NEW # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt' @@ -120,6 +124,8 @@ def add(urls: str | list[str], 'PLUGINS': plugins, 'DEFAULT_PERSONA': persona_name, 'PARSER': parser, + **({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}), + **({'URL_DENYLIST': url_denylist} if url_denylist else {}), } ) @@ -150,6 +156,9 @@ def add(urls: str | list[str], snapshot.ensure_crawl_symlink() return crawl, crawl.snapshot_set.all() + if bg: + crawl.create_snapshots_from_urls() + # 5. 
Start the crawl runner to process the queue # The runner will: # - Process Crawl -> create Snapshots from all URLs @@ -192,8 +201,7 @@ def add(urls: str | list[str], except Exception: rel_output_str = str(crawl.output_dir) - # Build admin URL from SERVER_CONFIG - bind_addr = SERVER_CONFIG.BIND_ADDR + bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000' if bind_addr.startswith('http://') or bind_addr.startswith('https://'): base_url = bind_addr else: @@ -218,11 +226,13 @@ def add(urls: str | list[str], @click.command() @click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away') @click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3') +@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl') +@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl') @click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)') @click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. 
title,favicon,screenshot,singlefile,...') @click.option('--persona', default='Default', help='Authentication profile to use when archiving') @click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously') -@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them') +@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them') @click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now') @click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)') @click.argument('urls', nargs=-1, type=click.Path()) diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py index aea83413..6cf0dffc 100644 --- a/archivebox/cli/archivebox_archiveresult.py +++ b/archivebox/cli/archivebox_archiveresult.py @@ -42,6 +42,16 @@ from rich import print as rprint from archivebox.cli.cli_utils import apply_filters +def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict: + return { + 'type': 'ArchiveResult', + 'snapshot_id': str(snapshot_id), + 'plugin': plugin, + 'hook_name': hook_name, + 'status': status, + } + + # ============================================================================= # CREATE # ============================================================================= @@ -52,21 +62,21 @@ def create_archiveresults( status: str = 'queued', ) -> int: """ - Create ArchiveResults for Snapshots. + Create ArchiveResult request records for Snapshots. - Reads Snapshot records from stdin and creates ArchiveResult entries. + Reads Snapshot records from stdin and emits ArchiveResult request JSONL. Pass-through: Non-Snapshot/ArchiveResult records are output unchanged. 
- If --plugin is specified, only creates results for that plugin. - Otherwise, creates results for all pending plugins. + If --plugin is specified, only emits requests for that plugin. + Otherwise, emits requests for all enabled snapshot hooks. Exit codes: 0: Success 1: Failure """ - from django.utils import timezone - + from archivebox.config.configset import get_config + from archivebox.hooks import discover_hooks from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT - from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.core.models import Snapshot is_tty = sys.stdout.isatty() @@ -135,33 +145,20 @@ def create_archiveresults( created_count = 0 for snapshot in snapshots: if plugin: - # Create for specific plugin only - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin, - defaults={ - 'status': status, - 'retry_at': timezone.now(), - } - ) - if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = status - result.retry_at = timezone.now() - result.save() - if not is_tty: - write_record(result.to_json()) + write_record(build_archiveresult_request(snapshot.id, plugin, status=status)) created_count += 1 else: - # Create all pending plugins - snapshot.create_pending_archiveresults() - for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED): + config = get_config(crawl=snapshot.crawl, snapshot=snapshot) + hooks = discover_hooks('Snapshot', config=config) + for hook_path in hooks: + hook_name = hook_path.name + plugin_name = hook_path.parent.name if not is_tty: - write_record(result.to_json()) + write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status)) created_count += 1 - rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr) + rprint(f'[green]Created 
{created_count} archive result request records[/green]', file=sys.stderr) return 0 @@ -205,6 +202,7 @@ def list_archiveresults( 'succeeded': 'green', 'failed': 'red', 'skipped': 'dim', + 'noresults': 'dim', 'backoff': 'magenta', }.get(result.status, 'dim') rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}') @@ -233,8 +231,6 @@ def update_archiveresults( 0: Success 1: No input or error """ - from django.utils import timezone - from archivebox.misc.jsonl import read_stdin, write_record from archivebox.core.models import ArchiveResult @@ -257,7 +253,6 @@ def update_archiveresults( # Apply updates from CLI flags if status: result.status = status - result.retry_at = timezone.now() result.save() updated_count += 1 diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index dde97edb..8f132a58 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -38,15 +38,16 @@ import rich_click as click def process_archiveresult_by_id(archiveresult_id: str) -> int: """ - Run extraction for a single ArchiveResult by ID (used by workers). + Re-run extraction for a single ArchiveResult by ID. - Triggers the ArchiveResult's state machine tick() to run the extractor - plugin, but only after claiming ownership via retry_at. This keeps direct - CLI execution aligned with the worker lifecycle and prevents duplicate hook - runs if another process already owns the same ArchiveResult. + ArchiveResults are projected status rows, not queued work items. Re-running + a single result means resetting that row and queueing its parent snapshot + through the shared crawl runner with the corresponding plugin selected. 
""" from rich import print as rprint + from django.utils import timezone from archivebox.core.models import ArchiveResult + from archivebox.services.runner import run_crawl try: archiveresult = ArchiveResult.objects.get(id=archiveresult_id) @@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int: rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr) try: - # Claim-before-tick is the required calling pattern for direct - # state-machine drivers. If another worker already owns this row, - # report that and exit without running duplicate extractor side effects. - if not archiveresult.tick_claimed(lock_seconds=120): - print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]') - return 0 + archiveresult.reset_for_retry() + snapshot = archiveresult.snapshot + snapshot.status = snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) + + crawl = snapshot.crawl + if crawl.status != crawl.StatusChoices.STARTED: + crawl.status = crawl.StatusChoices.QUEUED + crawl.retry_at = timezone.now() + crawl.save(update_fields=['status', 'retry_at', 'modified_at']) + + run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin]) + archiveresult.refresh_from_db() if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') return 0 + elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS: + print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]') + return 0 elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) return 1 @@ -121,8 +133,9 @@ def run_plugins( rprint('[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr) return 1 - # Gather snapshot IDs to process + # Gather snapshot IDs and optional plugin constraints to process snapshot_ids = set() + requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set) for record in records: record_type = record.get('type') @@ -142,6 +155,9 @@ def run_plugins( snapshot_id = record.get('snapshot_id') if snapshot_id: snapshot_ids.add(snapshot_id) + plugin_name = record.get('plugin') + if plugin_name and not plugins_list: + requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name)) elif 'id' in record: # Assume it's a snapshot ID @@ -160,26 +176,15 @@ def run_plugins( rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr) continue - # Create pending ArchiveResults if needed - if plugins_list: - # Only create for specific plugins - for plugin_name in plugins_list: - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin_name, - defaults={ - 'status': ArchiveResult.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } - ) - if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = ArchiveResult.StatusChoices.QUEUED - result.retry_at = timezone.now() - result.save() - else: - # Create all pending plugins - snapshot.create_pending_archiveresults() + for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()): + existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first() + if existing_result and existing_result.status in [ + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ArchiveResult.StatusChoices.BACKOFF, + ]: + existing_result.reset_for_retry() # Reset snapshot status to allow processing if snapshot.status == Snapshot.StatusChoices.SEALED: @@ -207,10 +212,15 @@ def 
run_plugins( snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id)) for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items(): + selected_plugins = plugins_list or sorted({ + plugin + for snapshot_id in crawl_snapshot_ids + for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set()) + }) or None run_crawl( crawl_id, snapshot_ids=sorted(crawl_snapshot_ids), - selected_plugins=plugins_list or None, + selected_plugins=selected_plugins, ) # Output results as JSONL (when piped) or human-readable (when TTY) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index eb603b77..6714c537 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots @click.option('--tag', '-t', help='Filter by tag name') @click.option('--crawl-id', help='Filter by crawl ID') @click.option('--limit', '-n', type=int, help='Limit number of results') +@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. 
url, created_at, bookmarked_at, downloaded_at') +@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title') +@click.option('--with-headers', is_flag=True, help='Include column headers in structured output') def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str], - tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None: - """List Snapshots as JSONL.""" + tag: Optional[str], crawl_id: Optional[str], limit: Optional[int], + sort: Optional[str], csv: Optional[str], with_headers: bool) -> None: + """List Snapshots.""" sys.exit(list_snapshots( status=status, url__icontains=url__icontains, @@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: tag=tag, crawl_id=crawl_id, limit=limit, + sort=sort, + csv=csv, + with_headers=with_headers, )) diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index c8acbbff..6ba981f0 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -42,6 +42,7 @@ import rich_click as click from rich import print as rprint from archivebox.cli.cli_utils import apply_filters +from archivebox.personas import importers as persona_importers # ============================================================================= @@ -440,8 +441,6 @@ def create_personas( browser_binary = get_browser_binary(import_from) if browser_binary: rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr) - else: - browser_binary = None created_count = 0 for name in name_list: @@ -450,7 +449,7 @@ def create_personas( continue # Validate persona name to prevent path traversal - is_valid, error_msg = validate_persona_name(name) + is_valid, error_msg = persona_importers.validate_persona_name(name) if not is_valid: rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr) continue @@ -468,49 
+467,29 @@ def create_personas( # Import browser profile if requested if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None: - persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR) - - # Copy the browser profile - rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr) - try: - # Remove existing chrome_user_data if it exists - if persona_chrome_dir.exists(): - shutil.rmtree(persona_chrome_dir) - - # Copy the profile directory - # We copy the entire user data dir, not just Default profile - shutil.copytree( - source_profile_dir, - persona_chrome_dir, - symlinks=True, - ignore=shutil.ignore_patterns( - 'Cache', 'Code Cache', 'GPUCache', 'ShaderCache', - 'Service Worker', 'GCM Store', '*.log', 'Crashpad', - 'BrowserMetrics', 'BrowserMetrics-spare.pma', - 'SingletonLock', 'SingletonSocket', 'SingletonCookie', - ), + import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile) + import_result = persona_importers.import_persona_from_source( + persona, + import_source, + copy_profile=True, + import_cookies=True, + capture_storage=False, ) - rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr) - - # Extract cookies via CDP - rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) - - if extract_cookies_via_cdp( - persona_chrome_dir, - cookies_file, - profile_dir=profile, - chrome_binary=browser_binary, - ): - rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) - else: - rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr) - rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr) - except Exception as e: - rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr) + rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr) return 1 + if import_result.profile_copied: + rprint('[green]Copied browser profile to 
persona[/green]', file=sys.stderr) + if import_result.cookies_imported: + rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) + elif not import_result.profile_copied: + rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr) + + for warning in import_result.warnings: + rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr) + if not is_tty: write_record({ 'id': str(persona.id) if hasattr(persona, 'id') else None, @@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int: # Apply updates from CLI flags if name: # Validate new name to prevent path traversal - is_valid, error_msg = validate_persona_name(name) + is_valid, error_msg = persona_importers.validate_persona_name(name) if not is_valid: rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr) continue diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py index 21938572..41c0724a 100644 --- a/archivebox/cli/archivebox_pluginmap.py +++ b/archivebox/cli/archivebox_pluginmap.py @@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """ └─────────────────────────────────────────────────────────────────────────────┘ """ -ARCHIVERESULT_MACHINE_DIAGRAM = """ -┌─────────────────────────────────────────────────────────────────────────────┐ -│ ArchiveResultMachine │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌─────────────┐ │ -│ │ QUEUED │◄─────────────────┐ │ -│ │ (initial) │ │ │ -│ └──┬───────┬──┘ │ │ -│ │ │ │ tick() unless can_start() │ -│ │ │ exceeded_max_ │ │ -│ │ │ attempts │ │ -│ │ ▼ │ │ -│ │ ┌──────────┐ │ │ -│ │ │ SKIPPED │ │ │ -│ │ │ (final) │ │ │ -│ │ └──────────┘ │ │ -│ │ tick() when │ │ -│ │ can_start() │ │ -│ ▼ │ │ -│ ┌─────────────┐ │ │ -│ │ STARTED │──────────────────┘ │ -│ │ │◄─────────────────────────────────────────────────┐ │ -│ │ enter: │ │ │ │ -│ │ result.run()│ tick() unless │ │ │ -│ │ (execute │ is_finished() │ │ │ -│ │ hook 
via │──────────────────────┘ │ │ -│ │ run_hook())│ │ │ -│ └──────┬──────┘ │ │ -│ │ │ │ -│ │ tick() checks status set by hook output │ │ -│ ├─────────────┬─────────────┬─────────────┐ │ │ -│ ▼ ▼ ▼ ▼ │ │ -│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │ -│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │ │ -│ │ (final) │ │ (final) │ │ (final) │ │ │ │ │ -│ └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘ │ │ -│ │ │ │ │ -│ exceeded_max_ │ │ can_start()│ │ -│ attempts │ │ loops back │ │ -│ ▼ │ └────────────┘ │ -│ ┌──────────┐ │ │ -│ │ SKIPPED │◄─┘ │ -│ │ (final) │ │ -│ └──────────┘ │ -│ │ -│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │ -└─────────────────────────────────────────────────────────────────────────────┘ -""" - BINARY_MACHINE_DIAGRAM = """ ┌─────────────────────────────────────────────────────────────────────────────┐ │ BinaryMachine │ @@ -193,8 +143,8 @@ def pluginmap( """ Show a map of all state machines and their associated plugin hooks. - Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot, - ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks + Displays ASCII art diagrams of the core queued model state machines (Crawl, + Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks that will run for each model's transitions. 
""" from rich.console import Console @@ -257,17 +207,6 @@ def pluginmap( prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]') prnt() - # Show diagrams first (unless quiet mode) - if not quiet: - # Show ArchiveResult diagram separately since it's different - prnt(Panel( - ARCHIVERESULT_MACHINE_DIAGRAM, - title='[bold green]ArchiveResultMachine[/bold green]', - border_style='green', - expand=False, - )) - prnt() - for event_name, info in model_events.items(): # Discover hooks for this event hooks = discover_hooks(event_name, filter_disabled=not show_disabled) diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py index fd88823b..292baf87 100644 --- a/archivebox/cli/archivebox_run.py +++ b/archivebox/cli/archivebox_run.py @@ -145,17 +145,25 @@ def process_stdin_records() -> int: try: archiveresult = ArchiveResult.objects.get(id=record_id) except ArchiveResult.DoesNotExist: - archiveresult = ArchiveResult.from_json(record) + archiveresult = None else: - # New archiveresult - create it - archiveresult = ArchiveResult.from_json(record) + archiveresult = None + snapshot_id = record.get('snapshot_id') + plugin_name = record.get('plugin') + snapshot = None if archiveresult: - archiveresult.retry_at = timezone.now() - if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]: - archiveresult.status = ArchiveResult.StatusChoices.QUEUED - archiveresult.save() + if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]: + archiveresult.reset_for_retry() snapshot = archiveresult.snapshot + plugin_name = plugin_name or archiveresult.plugin + elif snapshot_id: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + snapshot = None + + if snapshot: snapshot.retry_at = timezone.now() if snapshot.status != 
Snapshot.StatusChoices.STARTED: snapshot.status = Snapshot.StatusChoices.QUEUED @@ -167,9 +175,9 @@ def process_stdin_records() -> int: crawl.save(update_fields=['status', 'retry_at', 'modified_at']) crawl_id = str(snapshot.crawl_id) snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id)) - if archiveresult.plugin: - plugin_names_by_crawl[crawl_id].add(archiveresult.plugin) - output_records.append(archiveresult.to_json()) + if plugin_name: + plugin_names_by_crawl[crawl_id].add(str(plugin_name)) + output_records.append(record if not archiveresult else archiveresult.to_json()) queued_count += 1 elif record_type == TYPE_BINARY: @@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int: """ from django.utils import timezone from archivebox.machine.models import Machine, Process - from archivebox.services.runner import run_pending_crawls + from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls Process.cleanup_stale_running() + recover_orphaned_snapshots() + recover_orphaned_crawls() Machine.current() current = Process.current() if current.process_type != Process.TypeChoices.ORCHESTRATOR: @@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str): traceback.print_exc() sys.exit(1) + if daemon: + if not sys.stdin.isatty(): + exit_code = process_stdin_records() + if exit_code != 0: + sys.exit(exit_code) + sys.exit(run_runner(daemon=True)) + if not sys.stdin.isatty(): sys.exit(process_stdin_records()) else: diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index 36e53e91..cbd7a9ce 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -3,9 +3,7 @@ __package__ = 'archivebox.cli' from typing import Iterable -import os import sys -import subprocess import rich_click as click from rich import print @@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types from archivebox.config.common import 
SERVER_CONFIG +def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int: + """Stop any existing orchestrator process so the server can take ownership.""" + process_model.cleanup_stale_running(machine=machine) + + running_runners = list(process_model.objects.filter( + machine=machine, + status=process_model.StatusChoices.RUNNING, + process_type=process_model.TypeChoices.ORCHESTRATOR, + ).order_by('created_at')) + + if not running_runners: + return 0 + + log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]') + + if supervisor is not None and stop_worker_fn is not None: + for worker_name in ('worker_runner', 'worker_runner_watch'): + try: + stop_worker_fn(supervisor, worker_name) + except Exception: + pass + + for proc in running_runners: + try: + proc.kill_tree(graceful_timeout=2.0) + except Exception: + try: + proc.terminate(graceful_timeout=2.0) + except Exception: + pass + + process_model.cleanup_stale_running(machine=machine) + return len(running_runners) + + @enforce_types def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), reload: bool=False, @@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), if debug or reload: SHELL_CONFIG.DEBUG = True - if run_in_debug: - os.environ['ARCHIVEBOX_RUNSERVER'] = '1' - if reload: - os.environ['ARCHIVEBOX_AUTORELOAD'] = '1' - from archivebox.config.common import STORAGE_CONFIG - pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid') - os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile - - from django.utils.autoreload import DJANGO_AUTORELOAD_ENV - is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true' - if not is_reloader_child: - env = os.environ.copy() - subprocess.Popen( - [sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'], - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - from django.contrib.auth.models import User 
if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): @@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,), except IndexError: pass + from archivebox.workers.supervisord_util import ( + get_existing_supervisord_process, + get_worker, + stop_worker, + start_server_workers, + is_port_in_use, + ) + from archivebox.machine.models import Machine, Process + + # Check if port is already in use + if is_port_in_use(host, int(port)): + print(f'[red][X] Error: Port {port} is already in use[/red]') + print(f' Another process (possibly daphne or runserver) is already listening on {host}:{port}') + print(' Stop the conflicting process or choose a different port') + sys.exit(1) + + machine = Machine.current() + stop_existing_background_runner( + machine=machine, + process_model=Process, + supervisor=get_existing_supervisord_process(), + stop_worker_fn=stop_worker, + ) + + supervisor = get_existing_supervisord_process() + if supervisor: + server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne' + server_proc = get_worker(supervisor, server_worker_name) + server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None + if server_state == 'RUNNING': + runner_proc = get_worker(supervisor, 'worker_runner') + runner_watch_proc = get_worker(supervisor, 'worker_runner_watch') + runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None + runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None + print('[red][X] Error: ArchiveBox server is already running[/red]') + print(f' [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + if runner_state == 'RUNNING': + print(' [green]√[/green] Background runner (worker_runner) is RUNNING') + if runner_watch_state == 'RUNNING': + print(' [green]√[/green] Reload watcher 
(worker_runner_watch) is RUNNING') + print() + print('[yellow]To stop the existing server, run:[/yellow]') + print(' pkill -f "archivebox server"') + print(' pkill -f supervisord') + sys.exit(1) + if run_in_debug: - from django.core.management import call_command print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]') - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') - print(' > Writing ArchiveBox error log to ./logs/errors.log') - if not reload: - runserver_args.append('--noreload') # '--insecure' - if nothreading: - runserver_args.append('--nothreading') - call_command("runserver", *runserver_args) else: - from archivebox.workers.supervisord_util import ( - get_existing_supervisord_process, - get_worker, - start_server_workers, - is_port_in_use, - ) - from archivebox.machine.models import Machine, Process - - # Check if port is already in use - if is_port_in_use(host, int(port)): - print(f'[red][X] Error: Port {port} is already in use[/red]') - print(f' Another process (possibly daphne) is already listening on {host}:{port}') - print(' Stop the conflicting process or choose a different port') - sys.exit(1) - - # Check if the background crawl runner is already running for this data directory - if Process.objects.filter( - machine=Machine.current(), - status=Process.StatusChoices.RUNNING, - process_type=Process.TypeChoices.ORCHESTRATOR, - ).exists(): - print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]') - print(' Stop the existing runner before starting a new server') - print(' To stop: pkill -f "archivebox run --daemon"') - sys.exit(1) - - # Check if supervisord is already running - supervisor = 
get_existing_supervisord_process() - if supervisor: - daphne_proc = get_worker(supervisor, 'worker_daphne') - daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None - - # If daphne is already running, error out - if daphne_state == 'RUNNING': - runner_proc = get_worker(supervisor, 'worker_runner') - runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None - print('[red][X] Error: ArchiveBox server is already running[/red]') - print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - if runner_state == 'RUNNING': - print(' [green]√[/green] Background runner (worker_runner) is RUNNING') - print() - print('[yellow]To stop the existing server, run:[/yellow]') - print(' pkill -f "archivebox server"') - print(' pkill -f supervisord') - sys.exit(1) - # Otherwise, daphne is not running - fall through to start it - - # No existing workers found - start new ones print('[green][+] Starting ArchiveBox webserver...[/green]') - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') - print(' > Writing ArchiveBox error log to ./logs/errors.log') - print() - start_server_workers(host=host, port=port, daemonize=daemonize) - print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") + print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') + print(' > Writing ArchiveBox 
error log to ./logs/errors.log') + print() + start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading) + print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") @click.command() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 46ad2949..ae65fdab 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -172,6 +172,9 @@ def list_snapshots( tag: Optional[str] = None, crawl_id: Optional[str] = None, limit: Optional[int] = None, + sort: Optional[str] = None, + csv: Optional[str] = None, + with_headers: bool = False, ) -> int: """ List Snapshots as JSONL with optional filters. @@ -182,7 +185,11 @@ def list_snapshots( from archivebox.misc.jsonl import write_record from archivebox.core.models import Snapshot - is_tty = sys.stdout.isatty() + if with_headers and not csv: + rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr) + return 2 + + is_tty = sys.stdout.isatty() and not csv queryset = Snapshot.objects.all().order_by('-created_at') @@ -199,7 +206,29 @@ def list_snapshots( if tag: queryset = queryset.filter(tags__name__iexact=tag) + if sort: + queryset = queryset.order_by(sort) + count = 0 + if csv: + cols = [col.strip() for col in csv.split(',') if col.strip()] + if not cols: + rprint('[red]No CSV columns provided[/red]', file=sys.stderr) + return 2 + rows: list[str] = [] + if with_headers: + rows.append(','.join(cols)) + for snapshot in queryset.iterator(chunk_size=500): + rows.append(snapshot.to_csv(cols=cols, separator=',')) + count += 1 + output = '\n'.join(rows) + if output: + sys.stdout.write(output) + if not output.endswith('\n'): + sys.stdout.write('\n') + rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) + return 0 + for snapshot in queryset: if is_tty: status_color = { diff --git a/archivebox/config/common.py b/archivebox/config/common.py index 
1546332d..f0395f97 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -1,6 +1,7 @@ __package__ = "archivebox.config" import re +import secrets import sys import shutil from typing import ClassVar, Dict, Optional, List @@ -8,7 +9,6 @@ from pathlib import Path from rich import print from pydantic import Field, field_validator -from django.utils.crypto import get_random_string from archivebox.config.configset import BaseConfigSet @@ -104,7 +104,7 @@ class ServerConfig(BaseConfigSet): "danger-onedomain-fullreplay", ) - SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_")) + SECRET_KEY: str = Field(default_factory=lambda: ''.join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50))) BIND_ADDR: str = Field(default="127.0.0.1:8000") LISTEN_HOST: str = Field(default="archivebox.localhost:8000") ADMIN_BASE_URL: str = Field(default="") diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 8fa3adc8..df7a83d6 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -1,10 +1,13 @@ __package__ = 'archivebox.config' +import html +import json import os -import shutil import inspect +import re from pathlib import Path -from typing import Any, Dict +from typing import Any, Callable, Dict +from urllib.parse import quote, urlencode from django.http import HttpRequest from django.utils import timezone from django.utils.html import format_html @@ -18,16 +21,27 @@ from archivebox.misc.util import parse_date from archivebox.machine.models import Binary +ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/' +ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/' +LIVE_CONFIG_BASE_URL = '/admin/environment/config/' +ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/' +INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/' + # Common binaries to check for 
KNOWN_BINARIES = [ 'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable', - 'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl', + 'node', 'npm', 'npx', 'yt-dlp', 'git', 'singlefile', 'readability-extractor', 'mercury-parser', 'python3', 'python', 'bash', 'zsh', 'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox', ] +CANONICAL_BINARY_ALIASES = { + 'youtube-dl': 'yt-dlp', + 'ytdlp': 'yt-dlp', +} + def is_superuser(request: HttpRequest) -> bool: return bool(getattr(request.user, 'is_superuser', False)) @@ -38,6 +52,249 @@ def format_parsed_datetime(value: object) -> str: return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else "" +JSON_TOKEN_RE = re.compile( + r'(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)' + r'|(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")' + r'|(?P\btrue\b|\bfalse\b)' + r'|(?P\bnull\b)' + r'|(?P-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)' +) + + +def render_code_block(text: str, *, highlighted: bool = False) -> str: + code = html.escape(text, quote=False) + + if highlighted: + def _wrap_token(match: re.Match[str]) -> str: + styles = { + 'key': 'color: #0550ae;', + 'string': 'color: #0a7f45;', + 'boolean': 'color: #8250df; font-weight: 600;', + 'null': 'color: #6e7781; font-style: italic;', + 'number': 'color: #b35900;', + } + token_type = next(name for name, value in match.groupdict().items() if value is not None) + return f'{match.group(0)}' + + code = JSON_TOKEN_RE.sub(_wrap_token, code) + + return ( + '
'
+        ''
+        f'{code}'
+        '
' + ) + + +def render_highlighted_json_block(value: Any) -> str: + return render_code_block(json.dumps(value, indent=2, ensure_ascii=False), highlighted=True) + + +def get_plugin_docs_url(plugin_name: str) -> str: + return f'{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}' + + +def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str: + return f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}' + + +def get_live_config_url(key: str) -> str: + return f'{LIVE_CONFIG_BASE_URL}{quote(key)}/' + + +def get_environment_binary_url(name: str) -> str: + return f'{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/' + + +def get_installed_binary_change_url(name: str, binary: Any) -> str | None: + binary_id = getattr(binary, 'id', None) + if not binary_id: + return None + + base_url = getattr(binary, 'admin_change_url', None) or f'{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/' + changelist_filters = urlencode({'q': canonical_binary_name(name)}) + return f'{base_url}?{urlencode({"_changelist_filters": changelist_filters})}' + + +def get_machine_admin_url() -> str | None: + try: + from archivebox.machine.models import Machine + return Machine.current().admin_change_url + except Exception: + return None + + +def render_code_tag_list(values: list[str]) -> str: + if not values: + return '(none)' + + tags = ''.join( + str(format_html( + '{}', + value, + )) + for value in values + ) + return f'
{tags}
' + + +def render_plugin_metadata_html(config: dict[str, Any]) -> str: + rows = ( + ('Title', config.get('title') or '(none)'), + ('Description', config.get('description') or '(none)'), + ('Required Plugins', mark_safe(render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url))), + ('Required Binaries', mark_safe(render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url))), + ('Output MIME Types', mark_safe(render_code_tag_list(config.get('output_mimetypes') or []))), + ) + + rendered_rows = ''.join( + str(format_html( + '
' + '
{}
' + '
{}
' + '
', + label, + value, + )) + for label, value in rows + ) + return f'
{rendered_rows}
' + + +def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str: + if not values: + return '(none)' + + tags = [] + for value in values: + if url_resolver is None: + tags.append(str(format_html( + '{}', + value, + ))) + else: + tags.append(str(format_html( + '' + '{}' + '', + url_resolver(value), + value, + ))) + return f'
{"".join(tags)}
' + + +def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str: + links = [ + str(format_html('Computed value', get_live_config_url(prop_name))), + ] + if machine_admin_url: + links.append(str(format_html('Edit override', machine_admin_url))) + + fallback = prop_info.get('x-fallback') + if isinstance(fallback, str) and fallback: + links.append(str(format_html('Fallback: {}', get_live_config_url(fallback), fallback))) + + aliases = prop_info.get('x-aliases') or [] + if isinstance(aliases, list): + for alias in aliases: + if isinstance(alias, str) and alias: + links.append(str(format_html('Alias: {}', get_live_config_url(alias), alias))) + + default = prop_info.get('default') + if prop_name.endswith('_BINARY') and isinstance(default, str) and default: + links.append(str(format_html('Binary: {}', get_environment_binary_url(default), default))) + + return '   '.join(links) + + +def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str: + header_links = [ + str(format_html('Dependencies', ENVIRONMENT_BINARIES_BASE_URL)), + str(format_html('Installed Binaries', INSTALLED_BINARIES_BASE_URL)), + ] + if machine_admin_url: + header_links.insert(0, str(format_html('Machine Config Editor', machine_admin_url))) + + cards = [ + f'
{"   |   ".join(header_links)}
' + ] + + for prop_name, prop_info in properties.items(): + prop_type = prop_info.get('type', 'unknown') + if isinstance(prop_type, list): + prop_type = ' | '.join(str(type_name) for type_name in prop_type) + prop_desc = prop_info.get('description', '') + + default_html = '' + if 'default' in prop_info: + default_html = str(format_html( + '
Default: {}
', + prop_info['default'], + )) + + description_html = prop_desc or mark_safe('(no description)') + cards.append(str(format_html( + '
' + '
' + '{}' + ' ({})' + '
' + '
{}
' + '
{}
' + '{}' + '
', + get_live_config_url(prop_name), + prop_name, + prop_type, + description_html, + mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)), + mark_safe(default_html), + ))) + + return ''.join(cards) + + +def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str: + if not hooks: + return '(none)' + + items = [] + for hook_name in hooks: + if source == 'builtin': + items.append(str(format_html( + '
' + '{}' + '
', + get_plugin_hook_source_url(plugin_name, hook_name), + hook_name, + ))) + else: + items.append(str(format_html( + '
{}
', + hook_name, + ))) + return ''.join(items) + + +def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str: + installed_binary_url = get_installed_binary_change_url(name, db_binary) + + if installed_binary_url: + return str(format_html( + '{}
' + 'View Installed Binary Record', + merged['abspath'], + installed_binary_url, + )) + + return str(format_html('{}', merged['abspath'])) + + def obj_to_yaml(obj: Any, indent: int = 0) -> str: indent_str = " " * indent if indent == 0: @@ -80,21 +337,41 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str: return f" {str(obj)}" -def get_detected_binaries() -> Dict[str, Dict[str, Any]]: - """Detect available binaries using shutil.which.""" - binaries = {} +def canonical_binary_name(name: str) -> str: + return CANONICAL_BINARY_ALIASES.get(name, name) - for name in KNOWN_BINARIES: - path = shutil.which(name) - if path: - binaries[name] = { - 'name': name, - 'abspath': path, - 'version': None, # Could add version detection later - 'is_available': True, - } - return binaries +def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]: + return ( + int(binary.status == Binary.StatusChoices.INSTALLED), + int(bool(binary.version)), + int(bool(binary.abspath)), + binary.modified_at, + ) + + +def get_db_binaries_by_name() -> Dict[str, Binary]: + grouped: Dict[str, list[Binary]] = {} + for binary in Binary.objects.all(): + grouped.setdefault(canonical_binary_name(binary.name), []).append(binary) + + return { + name: max(records, key=_binary_sort_key) + for name, records in grouped.items() + } + + +def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]: + is_installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED) + return { + 'name': canonical_binary_name(name), + 'version': str(getattr(binary, 'version', '') or ''), + 'binprovider': str(getattr(binary, 'binprovider', '') or ''), + 'abspath': str(getattr(binary, 'abspath', '') or ''), + 'sha256': str(getattr(binary, 'sha256', '') or ''), + 'status': str(getattr(binary, 'status', '') or ''), + 'is_available': is_installed and bool(getattr(binary, 'abspath', '') or ''), + } def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]: @@ -150,29 +427,18 @@ def 
binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: "Found Abspath": [], } - # Get binaries from database (previously detected/installed) - db_binaries = {b.name: b for b in Binary.objects.all()} - - # Get currently detectable binaries - detected = get_detected_binaries() - - # Merge and display - all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys()))) + db_binaries = get_db_binaries_by_name() + all_binary_names = sorted(db_binaries.keys()) for name in all_binary_names: - db_binary = db_binaries.get(name) - detected_binary = detected.get(name) + merged = serialize_binary_record(name, db_binaries.get(name)) rows['Binary Name'].append(ItemLink(name, key=name)) - if db_binary: - rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found') - rows['Provided By'].append(db_binary.binprovider or 'PATH') - rows['Found Abspath'].append(str(db_binary.abspath or '')) - elif detected_binary: - rows['Found Version'].append('✅ found') - rows['Provided By'].append('PATH') - rows['Found Abspath'].append(detected_binary['abspath']) + if merged['is_available']: + rows['Found Version'].append(f"✅ {merged['version']}" if merged['version'] else '✅ found') + rows['Provided By'].append(merged['binprovider'] or '-') + rows['Found Abspath'].append(merged['abspath'] or '-') else: rows['Found Version'].append('❌ missing') rows['Provided By'].append('-') @@ -187,41 +453,22 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert is_superuser(request), 'Must be a superuser to view configuration settings.' 
+ key = canonical_binary_name(key) - # Try database first - try: - binary = Binary.objects.get(name=key) - section: SectionData = { - "name": binary.name, - "description": str(binary.abspath or ''), - "fields": { - 'name': binary.name, - 'binprovider': binary.binprovider, - 'abspath': str(binary.abspath), - 'version': binary.version, - 'sha256': binary.sha256, - }, - "help_texts": {}, - } - return ItemContext( - slug=key, - title=key, - data=[section], - ) - except Binary.DoesNotExist: - pass + db_binary = get_db_binaries_by_name().get(key) + merged = serialize_binary_record(key, db_binary) - # Try to detect from PATH - path = shutil.which(key) - if path: + if merged['is_available']: section: SectionData = { "name": key, - "description": path, + "description": mark_safe(render_binary_detail_description(key, merged, db_binary)), "fields": { 'name': key, - 'binprovider': 'PATH', - 'abspath': path, - 'version': 'unknown', + 'binprovider': merged['binprovider'] or '-', + 'abspath': merged['abspath'] or 'not found', + 'version': merged['version'] or 'unknown', + 'sha256': merged['sha256'], + 'status': merged['status'], }, "help_texts": {}, } @@ -233,12 +480,13 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: section: SectionData = { "name": key, - "description": "Binary not found", + "description": "No persisted Binary record found", "fields": { 'name': key, - 'binprovider': 'not installed', - 'abspath': 'not found', - 'version': 'N/A', + 'binprovider': merged['binprovider'] or 'not recorded', + 'abspath': merged['abspath'] or 'not recorded', + 'version': merged['version'] or 'N/A', + 'status': merged['status'] or 'unrecorded', }, "help_texts": {}, } @@ -293,8 +541,6 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: @render_with_item_view def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: - import json - assert is_superuser(request), 'Must be a superuser to view configuration 
settings.' plugins = get_filesystem_plugins() @@ -308,45 +554,61 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: ) # Base fields that all plugins have + docs_url = get_plugin_docs_url(plugin['name']) + machine_admin_url = get_machine_admin_url() fields = { "id": plugin['id'], "name": plugin['name'], "source": plugin['source'], - "path": plugin['path'], - "hooks": ', '.join(plugin['hooks']), } - # Add config.json data if available - if plugin.get('config'): - config_json = json.dumps(plugin['config'], indent=2) - fields["config.json"] = mark_safe( - '
{config_json}
' - ) - - # Also extract and display individual config properties for easier viewing - if 'properties' in plugin['config']: - config_properties = plugin['config']['properties'] - properties_summary = [] - for prop_name, prop_info in config_properties.items(): - prop_type = prop_info.get('type', 'unknown') - prop_desc = prop_info.get('description', '') - properties_summary.append(f"• {prop_name} ({prop_type}): {prop_desc}") - - if properties_summary: - fields["Config Properties"] = mark_safe('
'.join(properties_summary)) - - section: SectionData = { + sections: list[SectionData] = [{ "name": plugin['name'], - "description": plugin['path'], + "description": format_html( + '{}
ABX Plugin Docs', + plugin['path'], + docs_url, + ), "fields": fields, "help_texts": {}, - } + }] + + if plugin['hooks']: + sections.append({ + "name": "Hooks", + "description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])), + "fields": {}, + "help_texts": {}, + }) + + if plugin.get('config'): + sections.append({ + "name": "Plugin Metadata", + "description": mark_safe(render_plugin_metadata_html(plugin['config'])), + "fields": {}, + "help_texts": {}, + }) + + sections.append({ + "name": "config.json", + "description": mark_safe(render_highlighted_json_block(plugin['config'])), + "fields": {}, + "help_texts": {}, + }) + + config_properties = plugin['config'].get('properties', {}) + if config_properties: + sections.append({ + "name": "Config Properties", + "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)), + "fields": {}, + "help_texts": {}, + }) return ItemContext( slug=key, title=plugin['name'], - data=[section], + data=sections, ) diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 5a4c806c..6f5f3765 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -1,14 +1,23 @@ __package__ = 'archivebox.core' +import html +import json import os +import shlex from pathlib import Path +from urllib.parse import quote +from functools import reduce +from operator import and_ from django.contrib import admin +from django.db.models import Min, Q, TextField +from django.db.models.functions import Cast from django.utils.html import format_html from django.utils.safestring import mark_safe from django.core.exceptions import ValidationError from django.urls import reverse, resolve from django.utils import timezone +from django.utils.text import smart_split from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG @@ -16,11 +25,71 @@ from archivebox.misc.paginators import 
AccelleratedPaginator from archivebox.base_models.admin import BaseModelAdmin from archivebox.hooks import get_plugin_icon from archivebox.core.host_utils import build_snapshot_url +from archivebox.core.widgets import InlineTagEditorWidget +from archivebox.core.views import LIVE_PLUGIN_BASE_URL from archivebox.core.models import ArchiveResult, Snapshot +def _stringify_env_value(value) -> str: + if value is None: + return '' + if isinstance(value, str): + return value + return json.dumps(value, separators=(',', ':')) + + +def _quote_shell_string(value: str) -> str: + return "'" + str(value).replace("'", "'\"'\"'") + "'" + + +def _get_replay_source_url(result: ArchiveResult) -> str: + process_env = getattr(getattr(result, 'process', None), 'env', None) or {} + return str(process_env.get('SOURCE_URL') or result.snapshot.url or '') + + +def build_abx_dl_display_command(result: ArchiveResult) -> str: + source_url = _get_replay_source_url(result) + plugin_name = str(result.plugin or '').strip() + if not plugin_name and not source_url: + return 'abx-dl' + if not source_url: + return f'abx-dl --plugins={plugin_name}' + return f'abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}' + + +def build_abx_dl_replay_command(result: ArchiveResult) -> str: + display_command = build_abx_dl_display_command(result) + process = getattr(result, 'process', None) + env = getattr(process, 'env', None) or {} + env_items = ' '.join( + f'{key}={shlex.quote(_stringify_env_value(value))}' + for key, value in sorted(env.items()) + if value is not None + ) + snapshot_dir = shlex.quote(str(result.snapshot_dir)) + if env_items: + return f'cd {snapshot_dir}; env {env_items} {display_command}' + return f'cd {snapshot_dir}; {display_command}' + + +def get_plugin_admin_url(plugin_name: str) -> str: + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + 
if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/' + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/' + + return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/' + + def render_archiveresults_list(archiveresults_qs, limit=50): """Render a nice inline list view of archive results with status, plugin, output, and actions.""" @@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50): 'failed': ('#991b1b', '#fee2e2'), # red 'queued': ('#6b7280', '#f3f4f6'), # gray 'started': ('#92400e', '#fef3c7'), # amber + 'backoff': ('#92400e', '#fef3c7'), + 'skipped': ('#475569', '#f1f5f9'), + 'noresults': ('#475569', '#f1f5f9'), } rows = [] @@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50): if len(full_output) > 60: output_display += '...' - # Get full command as tooltip - cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-') + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result) + cmd_str_escaped = html.escape(display_cmd) + cmd_attr = html.escape(replay_cmd, quote=True) # Build output link - use embed_path() which checks output_files first embed_path = result.embed_path() if hasattr(result, 'embed_path') else None @@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50): - {str(result.id)[:8]} + {str(result.id)[-8:]} @@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
Command:
-
{cmd_str}
+
+ + {cmd_str_escaped} +
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50): - + @@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline): extra = 0 sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version') readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') - fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str') + fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str') # exclude = ('id',) ordering = ('end_ts',) show_change_link = True @@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str') + list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display') + list_display_links = None sort_fields = ('id', 'created_at', 'plugin', 'status') - readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon') - search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp') + readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link') + search_fields = () autocomplete_fields = ['snapshot'] fieldsets = ( @@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin): 'classes': ('card', 'wide'), }), ('Plugin', { - 'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'), + 'fields': ('plugin_with_icon', 'process_link', 'status'), 'classes': ('card',), }), ('Timing', { @@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin): self.request = request return 
super().change_view(request, object_id, form_url, extra_context) + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related('snapshot', 'process') + .prefetch_related('snapshot__tags') + .annotate(snapshot_first_tag=Min('snapshot__tags__name')) + ) + + def get_search_results(self, request, queryset, search_term): + if not search_term: + return queryset, False + + queryset = queryset.annotate( + snapshot_id_text=Cast('snapshot__id', output_field=TextField()), + snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()), + output_json_text=Cast('output_json', output_field=TextField()), + cmd_text=Cast('process__cmd', output_field=TextField()), + ) + + search_bits = [ + bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit + for bit in smart_split(search_term) + ] + search_bits = [bit.strip() for bit in search_bits if bit.strip()] + if not search_bits: + return queryset, False + + filters = [] + for bit in search_bits: + filters.append( + Q(snapshot_id_text__icontains=bit) + | Q(snapshot__url__icontains=bit) + | Q(snapshot__tags__name__icontains=bit) + | Q(snapshot_crawl_id_text__icontains=bit) + | Q(plugin__icontains=bit) + | Q(hook_name__icontains=bit) + | Q(output_str__icontains=bit) + | Q(output_json_text__icontains=bit) + | Q(cmd_text__icontains=bit) + ) + + return queryset.filter(reduce(and_, filters)).distinct(), True + + @admin.display(description='Details', ordering='id') + def details_link(self, result): + return format_html( + '{}', + reverse('admin:core_archiveresult_change', args=[result.id]), + str(result.id)[-8:], + ) + @admin.display( - description='Snapshot Info' + description='Snapshot', + ordering='snapshot__url', ) def snapshot_info(self, result): snapshot_id = str(result.snapshot_id) @@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin): def tags_str(self, result): return result.snapshot.tags_str() + @admin.display(description='Tags', 
ordering='snapshot_first_tag') + def tags_inline(self, result): + widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False) + tags_html = widget.render( + name=f'tags_{result.snapshot_id}', + value=result.snapshot.tags.all(), + attrs={'id': f'tags_{result.snapshot_id}'}, + snapshot_id=str(result.snapshot_id), + ) + return mark_safe(f'{tags_html}') + + @admin.display(description='Status', ordering='status') + def status_badge(self, result): + status = result.status or ArchiveResult.StatusChoices.QUEUED + return format_html( + '{}', + status, + status, + result.get_status_display() or status, + ) + @admin.display(description='Plugin', ordering='plugin') def plugin_with_icon(self, result): icon = get_plugin_icon(result.plugin) return format_html( - '{} {}', + '{}{}', + get_plugin_admin_url(result.plugin), result.plugin, icon, + get_plugin_admin_url(result.plugin), result.plugin, ) - def cmd_str(self, result): + @admin.display(description='Process', ordering='process__pid') + def process_link(self, result): + if not result.process_id: + return '-' + process_label = result.process.pid if result.process and result.process.pid else '-' return format_html( - '
{}
', - ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), + '{}', + reverse('admin:machine_process_change', args=[result.process_id]), + process_label, + ) + + @admin.display(description='Machine', ordering='process__machine__hostname') + def machine_link(self, result): + if not result.process_id or not result.process or not result.process.machine_id: + return '-' + machine = result.process.machine + return format_html( + '{} {}', + reverse('admin:machine_machine_change', args=[machine.id]), + str(machine.id)[:8], + machine.hostname, + ) + + @admin.display(description='Command') + def cmd_str(self, result): + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result) + return format_html( + ''' +
+ + + {} + +
+ ''', + replay_cmd, + replay_cmd, + display_cmd, ) def output_display(self, result): @@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin): result.output_str, ) + @admin.display(description='Output', ordering='output_str') + def output_str_display(self, result): + output_text = str(result.output_str or '').strip() + if not output_text: + return '-' + + live_path = result.embed_path() if hasattr(result, 'embed_path') else None + if live_path: + return format_html( + '{}', + build_snapshot_url(str(result.snapshot_id), live_path), + output_text, + output_text, + ) + + return format_html( + '{}', + output_text, + output_text, + ) + def output_summary(self, result): snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1] output_html = format_html( diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index 4541b8c3..ae6be452 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -61,12 +61,14 @@ def register_admin_site(): from archivebox.crawls.admin import register_admin as register_crawls_admin from archivebox.api.admin import register_admin as register_api_admin from archivebox.machine.admin import register_admin as register_machine_admin + from archivebox.personas.admin import register_admin as register_personas_admin from archivebox.workers.admin import register_admin as register_workers_admin register_core_admin(archivebox_admin) register_crawls_admin(archivebox_admin) register_api_admin(archivebox_admin) register_machine_admin(archivebox_admin) + register_personas_admin(archivebox_admin) register_workers_admin(archivebox_admin) return archivebox_admin diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index cf70f85d..0202e62c 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -6,6 +6,7 @@ from pathlib import Path from django.contrib import admin, messages from django.urls import path +from django.shortcuts import 
get_object_or_404, redirect from django.utils.html import format_html from django.utils.safestring import mark_safe from django.utils import timezone @@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce from django import forms from django.template import Template, RequestContext from django.contrib.admin.helpers import ActionForm +from django.middleware.csrf import get_token from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG @@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin from archivebox.core.host_utils import build_snapshot_url, build_web_url from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin -from archivebox.workers.tasks import bg_archive_snapshots, bg_add +from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add from archivebox.core.models import Tag, Snapshot, ArchiveResult from archivebox.core.admin_archiveresults import render_archiveresults_list @@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def get_urls(self): urls = super().get_urls() custom_urls = [ - path('grid/', self.admin_site.admin_view(self.grid_view), name='grid') + path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'), + path('/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'), ] return custom_urls + urls + def redo_failed_view(self, request, object_id): + snapshot = get_object_or_404(Snapshot, pk=object_id) + + if request.method == 'POST': + queued = bg_archive_snapshot(snapshot, overwrite=False) + messages.success( + request, + f"Queued {queued} snapshot for re-archiving. 
The background runner will process it.", + ) + + return redirect(snapshot.admin_change_url) + # def get_queryset(self, request): # # tags_qs = SnapshotTag.objects.all().select_related('tag') # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) @@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def admin_actions(self, obj): summary_url = build_web_url(f'/{obj.archive_path}') results_url = build_web_url(f'/{obj.archive_path}/index.html#all') + redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/' + csrf_token = get_token(self.request) return format_html( '''
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): onmouseout="this.style.background='#eff6ff';"> 🆕 Archive Now - - 🔁 Redo Failed - +
+ + +

- Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute. + Tip: Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.

''', summary_url, results_url, obj.url, obj.pk, - obj.pk, + redo_failed_url, + csrf_token, obj.pk, obj.pk, ) diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py index 87396ad8..3658badc 100644 --- a/archivebox/core/admin_tags.py +++ b/archivebox/core/admin_tags.py @@ -1,63 +1,74 @@ __package__ = 'archivebox.core' -from django.contrib import admin +from urllib.parse import quote + +from django import forms +from django.contrib import admin, messages +from django.contrib.admin.options import IS_POPUP_VAR +from django.http import HttpRequest, HttpResponseRedirect +from django.urls import reverse from django.utils.html import format_html from django.utils.safestring import mark_safe -from archivebox.misc.paginators import AccelleratedPaginator from archivebox.base_models.admin import BaseModelAdmin - from archivebox.core.models import SnapshotTag, Tag +from archivebox.core.tag_utils import ( + TAG_HAS_SNAPSHOTS_CHOICES, + TAG_SORT_CHOICES, + build_tag_cards, + get_tag_creator_choices, + get_tag_year_choices, + normalize_created_by_filter, + normalize_created_year_filter, + normalize_has_snapshots_filter, + normalize_tag_sort, +) +from archivebox.core.host_utils import build_snapshot_url class TagInline(admin.TabularInline): model = SnapshotTag - # fk_name = 'snapshot' fields = ('id', 'tag') extra = 1 - # min_num = 1 max_num = 1000 autocomplete_fields = ( 'tag', ) - -# class AutocompleteTags: -# model = Tag -# search_fields = ['name'] -# name = 'name' -# # source_field = 'name' -# remote_field = Tag._meta.get_field('name') -# class AutocompleteTagsAdminStub: -# name = 'admin' - - -# class TaggedItemInline(admin.TabularInline): -# readonly_fields = ('object_link',) -# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields) -# model = TaggedItem -# extra = 1 -# show_change_link = True - -# @admin.display(description='object') -# def object_link(self, obj): -# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id) -# return 
format_html('
[{}]', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj)) +class TagAdminForm(forms.ModelForm): + class Meta: + model = Tag + fields = '__all__' + widgets = { + 'name': forms.TextInput(attrs={ + 'placeholder': 'research, receipts, product-design...', + 'autocomplete': 'off', + 'spellcheck': 'false', + 'data-tag-name-input': '1', + }), + } + + def clean_name(self): + name = (self.cleaned_data.get('name') or '').strip() + if not name: + raise forms.ValidationError('Tag name is required.') + return name + - class TagAdmin(BaseModelAdmin): - list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots') + form = TagAdminForm + change_list_template = 'admin/core/tag/change_list.html' + change_form_template = 'admin/core/tag/change_form.html' + list_display = ('name', 'num_snapshots', 'created_at', 'created_by') list_filter = ('created_at', 'created_by') - sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at') - readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots') search_fields = ('id', 'name', 'slug') - actions = ['delete_selected', 'merge_tags'] - ordering = ['-created_at'] - # inlines = [TaggedItemInline] + readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots') + actions = ['delete_selected'] + ordering = ['name', 'id'] fieldsets = ( - ('Tag Info', { + ('Tag', { 'fields': ('name', 'slug'), 'classes': ('card',), }), @@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin): 'fields': ('id', 'created_by', 'created_at', 'modified_at'), 'classes': ('card',), }), - ('Snapshots', { + ('Recent Snapshots', { 'fields': ('snapshots',), 'classes': ('card', 'wide'), }), ) - paginator = AccelleratedPaginator + add_fieldsets = ( + ('Tag', { + 'fields': ('name',), + 'classes': ('card', 'wide'), + }), + ('Metadata', { + 'fields': ('created_by',), + 'classes': ('card',), + }), + ) + def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None): + return self.fieldsets if obj else 
self.add_fieldsets - def num_snapshots(self, tag): + def changelist_view(self, request: HttpRequest, extra_context=None): + query = (request.GET.get('q') or '').strip() + sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip()) + created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip()) + year = normalize_created_year_filter((request.GET.get('year') or '').strip()) + has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip()) + extra_context = { + **(extra_context or {}), + 'initial_query': query, + 'initial_sort': sort, + 'initial_created_by': created_by, + 'initial_year': year, + 'initial_has_snapshots': has_snapshots, + 'tag_sort_choices': TAG_SORT_CHOICES, + 'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES, + 'tag_created_by_choices': get_tag_creator_choices(), + 'tag_year_choices': get_tag_year_choices(), + 'initial_tag_cards': build_tag_cards( + query=query, + request=request, + sort=sort, + created_by=created_by, + year=year, + has_snapshots=has_snapshots, + ), + 'tag_search_api_url': reverse('api-1:search_tags'), + 'tag_create_api_url': reverse('api-1:tags_create'), + } + return super().changelist_view(request, extra_context=extra_context) + + def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None): + current_name = (request.POST.get('name') or '').strip() + if not current_name and obj: + current_name = obj.name + + similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12) + if obj: + similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk] + + context.update({ + 'tag_search_api_url': reverse('api-1:search_tags'), + 'tag_similar_cards': similar_tag_cards, + 'tag_similar_query': current_name, + }) + return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) + + def 
response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None): + if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST: + return super().response_add(request, obj, post_url_continue=post_url_continue) + + self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS) + return self._redirect_to_changelist(obj.name) + + def response_change(self, request: HttpRequest, obj: Tag): + if IS_POPUP_VAR in request.POST or '_continue' in request.POST or '_addanother' in request.POST or '_saveasnew' in request.POST: + return super().response_change(request, obj) + + self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS) + return self._redirect_to_changelist(obj.name) + + def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect: + changelist_url = reverse('admin:core_tag_changelist') + if query: + changelist_url = f'{changelist_url}?q={quote(query)}' + return HttpResponseRedirect(changelist_url) + + @admin.display(description='Snapshots') + def snapshots(self, tag: Tag): + snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10] + total_count = tag.snapshot_set.count() + if not snapshots: + return mark_safe( + f'

No snapshots use this tag yet. ' + f'Open filtered snapshot list.

' + ) + + cards = [] + for snapshot in snapshots: + title = (snapshot.title or '').strip() or snapshot.url + cards.append(format_html( + ''' + + + + {} + {} + + + ''', + reverse('admin:core_snapshot_change', args=[snapshot.pk]), + build_snapshot_url(str(snapshot.pk), 'favicon.ico'), + title[:120], + snapshot.url[:120], + )) + + cards.append(format_html( + 'View all {} tagged snapshots', + tag.id, + total_count, + )) + return mark_safe('
' + ''.join(cards) + '
') + + @admin.display(description='Snapshots', ordering='num_snapshots') + def num_snapshots(self, tag: Tag): + count = getattr(tag, 'num_snapshots', tag.snapshot_set.count()) return format_html( '{} total', tag.id, - tag.snapshot_set.count(), + count, ) - def snapshots(self, tag): - total_count = tag.snapshot_set.count() - return mark_safe('
'.join( - format_html( - '[{}] {}', - snap.pk, - snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', - snap.url[:64], - ) - for snap in tag.snapshot_set.order_by('-downloaded_at')[:10] - ) + (f'
{total_count} total snapshots...')) - - # def get_urls(self): - # urls = super().get_urls() - # custom_urls = [ - # path( - # "merge-tags/", - # self.admin_site.admin_view(self.merge_tags_view), - # name="taggit_tag_merge_tags", - # ), - # ] - # return custom_urls + urls - - # @admin.action(description="Merge selected tags") - # def merge_tags(self, request, queryset): - # selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME) - # if not selected: - # self.message_user(request, "Please select at least one tag.") - # return redirect(request.get_full_path()) - - # selected_tag_ids = ",".join(selected) - # redirect_url = f"{request.get_full_path()}merge-tags/" - - # request.session["selected_tag_ids"] = selected_tag_ids - - # return redirect(redirect_url) - - # def merge_tags_view(self, request): - # selected_tag_ids = request.session.get("selected_tag_ids", "").split(",") - # if request.method == "POST": - # form = MergeTagsForm(request.POST) - # if form.is_valid(): - # new_tag_name = form.cleaned_data["new_tag_name"] - # new_tag, created = Tag.objects.get_or_create(name=new_tag_name) - # with transaction.atomic(): - # for tag_id in selected_tag_ids: - # tag = Tag.objects.get(id=tag_id) - # tagged_items = TaggedItem.objects.filter(tag=tag) - # for tagged_item in tagged_items: - # if TaggedItem.objects.filter( - # tag=new_tag, - # content_type=tagged_item.content_type, - # object_id=tagged_item.object_id, - # ).exists(): - # # we have the new tag as well, so we can just - # # remove the tag association - # tagged_item.delete() - # else: - # # point this taggedItem to the new one - # tagged_item.tag = new_tag - # tagged_item.save() - - # # delete the old tag - # if tag.id != new_tag.id: - # tag.delete() - - # self.message_user(request, "Tags have been merged", level="success") - # # clear the selected_tag_ids from session after merge is complete - # request.session.pop("selected_tag_ids", None) - - # return redirect("..") - # else: - # 
self.message_user(request, "Form is invalid.", level="error") - - # context = { - # "form": MergeTagsForm(), - # "selected_tag_ids": selected_tag_ids, - # } - # return render(request, "admin/taggit/merge_tags_form.html", context) - - -# @admin.register(SnapshotTag, site=archivebox_admin) -# class SnapshotTagAdmin(BaseModelAdmin): -# list_display = ('id', 'snapshot', 'tag') -# sort_fields = ('id', 'snapshot', 'tag') -# search_fields = ('id', 'snapshot_id', 'tag_id') -# fields = ('snapshot', 'id') -# actions = ['delete_selected'] -# ordering = ['-id'] - def register_admin(admin_site): admin_site.register(Tag, TagAdmin) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index 8589563a..6050a6a7 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -1,12 +1,16 @@ __package__ = 'archivebox.core' from django import forms +from django.utils.html import format_html -from archivebox.misc.util import URL_REGEX +from archivebox.misc.util import URL_REGEX, find_all_urls from taggit.utils import edit_string_for_tags, parse_tags from archivebox.base_models.admin import KeyValueWidget from archivebox.crawls.schedule_utils import validate_schedule -from archivebox.hooks import get_plugins +from archivebox.config.common import SEARCH_BACKEND_CONFIG +from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget +from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon +from archivebox.personas.models import Persona DEPTH_CHOICES = ( ('0', 'depth = 0 (archive just these URLs)'), @@ -22,6 +26,22 @@ def get_plugin_choices(): return [(name, name) for name in get_plugins()] +def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str: + schema = plugin_configs.get(plugin_name, {}) + description = str(schema.get('description') or '').strip() + if not description: + return plugin_name + icon_html = get_plugin_icon(plugin_name) + + return format_html( + '{}{}{}', + icon_html, + plugin_name, + 
plugin_name, + description, + ) + + def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField: field = form.fields[name] if not isinstance(field, forms.ChoiceField): @@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField: class AddLinkForm(forms.Form): # Basic fields - url = forms.RegexField( - label="URLs (one per line)", - regex=URL_REGEX, - min_length=6, + url = forms.CharField( + label="URLs", strip=True, - widget=forms.Textarea, + widget=forms.Textarea(attrs={ + 'data-url-regex': URL_REGEX.pattern, + }), required=True ) tag = forms.CharField( - label="Tags (comma separated tag1,tag2,tag3)", + label="Tags", strip=True, required=False, - widget=forms.TextInput(attrs={ - 'list': 'tag-datalist', - 'autocomplete': 'off', - }) + widget=TagEditorWidget(), ) depth = forms.ChoiceField( label="Archive depth", @@ -58,11 +75,15 @@ class AddLinkForm(forms.Form): label="Notes", strip=True, required=False, - widget=forms.Textarea(attrs={ - 'rows': 3, - 'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)', + widget=forms.TextInput(attrs={ + 'placeholder': 'Optional notes about this crawl', }) ) + url_filters = forms.Field( + label="URL allowlist / denylist", + required=False, + widget=URLFiltersWidget(source_selector='textarea[name="url"]'), + ) # Plugin groups chrome_plugins = forms.MultipleChoiceField( @@ -111,24 +132,15 @@ class AddLinkForm(forms.Form): 'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)', }) ) - persona = forms.CharField( + persona = forms.ModelChoiceField( label="Persona (authentication profile)", - max_length=100, - initial='Default', - required=False, - ) - overwrite = forms.BooleanField( - label="Overwrite existing snapshots", - initial=False, - required=False, - ) - update = forms.BooleanField( - label="Update/retry previously failed URLs", - initial=False, required=False, + queryset=Persona.objects.none(), + empty_label=None, + to_field_name='name', ) 
index_only = forms.BooleanField( - label="Index only (don't archive yet)", + label="Index only dry run (add crawl but don't archive yet)", initial=False, required=False, ) @@ -142,11 +154,13 @@ class AddLinkForm(forms.Form): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Import at runtime to avoid circular imports - from archivebox.config.common import ARCHIVING_CONFIG + default_persona = Persona.get_or_create_default() + self.fields['persona'].queryset = Persona.objects.order_by('name') + self.fields['persona'].initial = default_persona.name # Get all plugins all_plugins = get_plugins() + plugin_configs = discover_plugin_configs() # Define plugin groups chrome_dependent = { @@ -170,26 +184,28 @@ class AddLinkForm(forms.Form): # Populate plugin field choices get_choice_field(self, 'chrome_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in chrome_dependent + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent ] get_choice_field(self, 'archiving_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in archiving + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving ] get_choice_field(self, 'parsing_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in parsing + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing ] get_choice_field(self, 'search_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in search + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search ] get_choice_field(self, 'binary_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in binary + (p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary ] get_choice_field(self, 'extension_plugins').choices = [ - (p, p) for p in sorted(all_plugins) if p in extensions + (p, get_plugin_choice_label(p, 
plugin_configs)) for p in sorted(all_plugins) if p in extensions ] - # Set update default from config - self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW + required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip() + search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices] + if required_search_plugin in search_choices: + get_choice_field(self, 'search_plugins').initial = [required_search_plugin] def clean(self): cleaned_data = super().clean() or {} @@ -207,6 +223,23 @@ class AddLinkForm(forms.Form): return cleaned_data + def clean_url(self): + value = self.cleaned_data.get('url') or '' + urls = '\n'.join(find_all_urls(value)) + if not urls: + raise forms.ValidationError('Enter at least one valid URL.') + return urls + + def clean_url_filters(self): + from archivebox.crawls.models import Crawl + + value = self.cleaned_data.get('url_filters') or {} + return { + 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))), + 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))), + 'same_domain_only': bool(value.get('same_domain_only')), + } + def clean_schedule(self): schedule = (self.cleaned_data.get('schedule') or '').strip() if not schedule: diff --git a/archivebox/core/host_utils.py b/archivebox/core/host_utils.py index 2cf8131b..c3581d4f 100644 --- a/archivebox/core/host_utils.py +++ b/archivebox/core/host_utils.py @@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str: return _build_base_url_for_host(get_api_host(), request=request) +def get_public_base_url(request=None) -> str: + return _build_base_url_for_host(get_public_host(), request=request) + + # Backwards-compat aliases (archive == web) def get_archive_base_url(request=None) -> str: return get_web_base_url(request=request) diff --git a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py new 
file mode 100644 index 00000000..4a8f74d1 --- /dev/null +++ b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py @@ -0,0 +1,15 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("core", "0031_add_archiveresult_snapshot_status_index"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="retry_at", + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index cf008afa..7f33bf0a 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -36,7 +36,7 @@ from archivebox.base_models.models import ( from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine from archivebox.workers.tasks import bg_archive_snapshot from archivebox.crawls.models import Crawl -from archivebox.machine.models import NetworkInterface, Binary +from archivebox.machine.models import Binary @@ -60,32 +60,41 @@ class Tag(ModelWithUUID): def __str__(self): return self.name + def _generate_unique_slug(self) -> str: + base_slug = slugify(self.name) or 'tag' + existing = Tag.objects.filter(slug__startswith=base_slug) + if self.pk: + existing = existing.exclude(pk=self.pk) + existing_slugs = set(existing.values_list("slug", flat=True)) + + slug = base_slug + i = 1 + while slug in existing_slugs: + slug = f"{base_slug}_{i}" + i += 1 + return slug + def save(self, *args, **kwargs): - is_new = self._state.adding - if is_new: - self.slug = slugify(self.name) - existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True)) - i = None - while True: - slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name) - if slug not in existing: - self.slug = slug - break - i = (i or 0) + 1 + existing_name = None + if self.pk: + existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first() + + if not self.slug or existing_name != self.name: + self.slug = self._generate_unique_slug() super().save(*args, 
**kwargs) - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created Tag', - indent_level=0, - metadata={ - 'id': self.id, - 'name': self.name, - 'slug': self.slug, - }, - ) + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created Tag', + # indent_level=0, + # metadata={ + # 'id': self.id, + # 'name': self.name, + # 'slug': self.slug, + # }, + # ) @property def api_url(self) -> str: @@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct() def save(self, *args, **kwargs): - is_new = self._state.adding if not self.bookmarked_at: self.bookmarked_at = self.created_at or timezone.now() if not self.timestamp: @@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea super().save(*args, **kwargs) self.ensure_legacy_archive_symlink() - if self.url not in self.crawl.urls: + existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url} + if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls: self.crawl.urls += f'\n{self.url}' self.crawl.save() - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created Snapshot', - indent_level=2, - url=self.url, - metadata={ - 'id': str(self.id), - 'crawl_id': str(self.crawl_id), - 'depth': self.depth, - 'status': self.status, - }, - ) + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created Snapshot', + # indent_level=2, + # url=self.url, + # metadata={ + # 'id': str(self.id), + # 'crawl_id': str(self.crawl_id), + # 'depth': self.depth, + # 'status': self.status, + # }, + # ) # 
========================================================================= # Filesystem Migration Methods @@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ Execute snapshot by creating pending ArchiveResults for all enabled hooks. - Called by: SnapshotMachine.enter_started() - - Hook Lifecycle: - 1. discover_hooks('Snapshot') → finds all plugin hooks - 2. For each hook: - - Create ArchiveResult with status=QUEUED - - Store hook_name (e.g., 'on_Snapshot__50_wget.py') - 3. ArchiveResults execute independently via ArchiveResultMachine - 4. Hook execution happens in ArchiveResult.run(), NOT here - Returns: list[ArchiveResult]: Newly created pending results """ @@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'url': self.url, 'title': self.title, 'tags': self.tags_str(), - 'tags_str': self.tags_str(), 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None, 'timestamp': self.timestamp, @@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # ID not found, fall through to create-by-URL logic pass - url = record.get('url') + from archivebox.misc.util import fix_url_from_markdown + + url = fix_url_from_markdown(str(record.get('url') or '').strip()) if not url: return None @@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea defaults={ 'plugin': plugin, 'status': ArchiveResult.INITIAL_STATE, - 'retry_at': timezone.now(), }, ) if archiveresult.status == ArchiveResult.INITIAL_STATE: @@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea failed = results.filter(status='failed').count() running = results.filter(status='started').count() skipped = results.filter(status='skipped').count() + noresults = 
results.filter(status='noresults').count() total = results.count() - pending = total - succeeded - failed - running - skipped + pending = total - succeeded - failed - running - skipped - noresults - # Calculate percentage (succeeded + failed + skipped as completed) - completed = succeeded + failed + skipped + # Calculate percentage (succeeded + failed + skipped + noresults as completed) + completed = succeeded + failed + skipped + noresults percent = int((completed / total * 100) if total > 0 else 0) # Sum output sizes @@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'running': running, 'pending': pending, 'skipped': skipped, + 'noresults': noresults, 'percent': percent, 'output_size': output_size, 'is_sealed': is_sealed, } - def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int: + def retry_failed_archiveresults(self) -> int: """ Reset failed/skipped ArchiveResults to queued for retry. - This enables seamless retry of the entire extraction pipeline: - - Resets FAILED and SKIPPED results to QUEUED - - Sets retry_at so workers pick them up - - Plugins run in order (numeric prefix) - - Each plugin checks its dependencies at runtime - - Dependency handling (e.g., chrome → screenshot): - - Plugins check if required outputs exist before running - - If dependency output missing → plugin returns 'skipped' - - On retry, if dependency now succeeds → dependent can run - Returns count of ArchiveResults reset. 
""" - retry_at = retry_at or timezone.now() - count = self.archiveresult_set.filter( status__in=[ ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, ] ).update( status=ArchiveResult.StatusChoices.QUEUED, - retry_at=retry_at, - output=None, + output_str='', + output_json=None, + output_files={}, + output_size=0, + output_mimetypes='', start_ts=None, end_ts=None, ) - # Also reset the snapshot and current_step so it gets re-checked from the beginning if count > 0: self.status = self.StatusChoices.STARTED - self.retry_at = retry_at + self.retry_at = timezone.now() self.current_step = 0 # Reset to step 0 for retry self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at']) @@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea best_result = outputs[0] context = { **self.to_dict(extended=True), + 'snapshot': self, 'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)), 'url_str': htmlencode(urldecode(self.base_url)), 'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank', @@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine): │ • discover_hooks('Snapshot') → finds all plugin hooks │ │ • create_pending_archiveresults() → creates ONE │ │ ArchiveResult per hook (NO execution yet) │ - │ 2. ArchiveResults process independently with their own │ - │ state machines (see ArchiveResultMachine) │ + │ 2. The shared abx-dl runner executes hooks and the │ + │ projector updates ArchiveResult rows from events │ │ 3. 
Advance through steps 0-9 as foreground hooks complete │ └─────────────────────────────────────────────────────────────┘ ↓ tick() when is_finished() @@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine): cast(Any, crawl).sm.seal() -class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine): +class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes): class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' STARTED = 'started', 'Started' @@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi SUCCEEDED = 'succeeded', 'Succeeded' FAILED = 'failed', 'Failed' SKIPPED = 'skipped', 'Skipped' + NORESULTS = 'noresults', 'No Results' + + INITIAL_STATE = StatusChoices.QUEUED + ACTIVE_STATE = StatusChoices.STARTED + FINAL_STATES = ( + StatusChoices.SUCCEEDED, + StatusChoices.FAILED, + StatusChoices.SKIPPED, + StatusChoices.NORESULTS, + ) + FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE) @classmethod def get_plugin_choices(cls): @@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi start_ts = models.DateTimeField(default=None, null=True, blank=True) end_ts = models.DateTimeField(default=None, null=True, blank=True) - status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED) - retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) + status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True) notes = models.TextField(blank=True, null=False, default='') # output_dir is computed via @property from snapshot.output_dir / plugin - state_machine_name = 'archivebox.core.models.ArchiveResultMachine' - retry_at_field_name = 'retry_at' - state_field_name = 'status' - active_state = StatusChoices.STARTED - snapshot_id: uuid.UUID process_id: uuid.UUID | None @@ -2421,7 +2418,6 @@ class 
ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi ModelWithOutputDir.Meta, ModelWithConfig.Meta, ModelWithNotes.Meta, - ModelWithStateMachine.Meta, ): app_label = 'core' verbose_name = 'Archive Result' @@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi return None def save(self, *args, **kwargs): - is_new = self._state.adding - - # Create Process record if this is a new ArchiveResult and no process exists yet - if is_new and not self.process_id: - from archivebox.machine.models import Process, Machine - - process = Process.objects.create( - machine=Machine.current(), - pwd=str(Path(self.snapshot.output_dir) / self.plugin), - cmd=[], # Will be set by run() - status='queued', - timeout=120, - env={}, - ) - self.process = process - # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories # Call the Django Model.save() directly instead models.Model.save(self, *args, **kwargs) - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created ArchiveResult', - indent_level=3, - plugin=self.plugin, - metadata={ - 'id': str(self.id), - 'snapshot_id': str(self.snapshot_id), - 'snapshot_url': str(self.snapshot.url)[:64], - 'status': self.status, - }, - ) + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created ArchiveResult', + # indent_level=3, + # plugin=self.plugin, + # metadata={ + # 'id': str(self.id), + # 'snapshot_id': str(self.snapshot_id), + # 'snapshot_url': str(self.snapshot.url)[:64], + # 'status': self.status, + # }, + # ) @cached_property def snapshot_dir(self): @@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def get_absolute_url(self): return f'/{self.snapshot.archive_path}/{self.plugin}' + def reset_for_retry(self, *, save: bool = True) -> None: + 
self.status = self.StatusChoices.QUEUED + self.output_str = '' + self.output_json = None + self.output_files = {} + self.output_size = 0 + self.output_mimetypes = '' + self.start_ts = None + self.end_ts = None + if save: + self.save(update_fields=[ + 'status', + 'output_str', + 'output_json', + 'output_files', + 'output_size', + 'output_mimetypes', + 'start_ts', + 'end_ts', + 'modified_at', + ]) + @property def plugin_module(self) -> Any | None: # Hook scripts are now used instead of Python plugin modules @@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi return None - def create_output_dir(self): - output_dir = Path(self.snapshot_dir) / self.plugin - output_dir.mkdir(parents=True, exist_ok=True) - return output_dir - @property def output_dir_name(self) -> str: return self.plugin @@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def save_search_index(self): pass - def cascade_health_update(self, success: bool): - """Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface).""" - # Update archival hierarchy - self.snapshot.increment_health_stats(success) - self.snapshot.crawl.increment_health_stats(success) - - # Update execution infrastructure - if self.binary: - self.binary.increment_health_stats(success) - if self.binary.machine: - self.binary.machine.increment_health_stats(success) - - if self.iface: - self.iface.increment_health_stats(success) - - def run(self): - """ - Execute this ArchiveResult's hook and update status. - - If self.hook_name is set, runs only that specific hook. - If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat). - - Updates status/output fields, queues discovered URLs, and triggers indexing. 
- """ - from django.utils import timezone - from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook - from archivebox.config.configset import get_config - - # Get merged config with proper context - config = get_config( - crawl=self.snapshot.crawl, - snapshot=self.snapshot, - ) - - # Determine which hook(s) to run - hooks = [] - - if self.hook_name: - # SPECIFIC HOOK MODE: Find the specific hook by name - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - hook_path = plugin_dir / self.hook_name - if hook_path.exists(): - hooks.append(hook_path) - break - else: - # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility) - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - matches = list(plugin_dir.glob('on_Snapshot__*.*')) - if matches: - hooks.extend(sorted(matches)) - - if not hooks: - self.status = self.StatusChoices.FAILED - if self.hook_name: - self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}' - else: - self.output_str = f'No hooks found for plugin: {self.plugin}' - self.retry_at = None - self.save() - return - - # Output directory is plugin_dir for the hook output - plugin_dir = Path(self.snapshot.output_dir) / self.plugin - - start_ts = timezone.now() - process = None - - for hook in hooks: - # Run hook using Process.launch() - returns Process model - process = run_hook( - hook, - output_dir=plugin_dir, - config=config, - url=self.snapshot.url, - snapshot_id=str(self.snapshot.id), - crawl_id=str(self.snapshot.crawl.id), - depth=self.snapshot.depth, - ) - - # Link ArchiveResult to Process - self.process = process - self.start_ts = start_ts - self.save(update_fields=['process_id', 'start_ts', 'modified_at']) - - if not process: - # No hooks ran - self.status = self.StatusChoices.FAILED - 
self.output_str = 'No hooks executed' - self.save() - return - - # Update status based on hook execution - if process.status == process.StatusChoices.RUNNING: - # BACKGROUND HOOK - still running, return immediately - # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup() - return - - # FOREGROUND HOOK - completed, update from filesystem - self.update_from_output() - - # Clean up empty output directory if no files were created - if plugin_dir.exists() and not self.output_files: - try: - if not any(plugin_dir.iterdir()): - plugin_dir.rmdir() - except (OSError, RuntimeError): - pass - def update_from_output(self): """ Update this ArchiveResult from filesystem logs and output files. - Used for: - - Foreground hooks that completed (called from ArchiveResult.run()) - - Background hooks that completed (called from Snapshot.cleanup()) + Used for Snapshot cleanup / orphan recovery when a hook's output exists + on disk but the projector did not finalize the row in the database. Updates: - status, output_str, output_json from ArchiveResult JSONL record - output_files, output_size, output_mimetypes by walking filesystem - - end_ts, retry_at, cmd, cmd_version, binary FK + - end_ts, cmd, cmd_version, binary FK - Processes side-effect records (Snapshot, Tag, etc.) 
via process_hook_records() """ import mimetypes @@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.status = self.StatusChoices.FAILED self.output_str = 'Output directory not found' self.end_ts = timezone.now() - self.retry_at = None self.save() return @@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi 'succeeded': self.StatusChoices.SUCCEEDED, 'failed': self.StatusChoices.FAILED, 'skipped': self.StatusChoices.SKIPPED, + 'noresults': self.StatusChoices.NORESULTS, } self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED) @@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # Update timestamps self.end_ts = timezone.now() - self.retry_at = None self.save() @@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot """ - import re - from archivebox.config.configset import get_config - - # Get merged config with proper hierarchy - config = get_config( - user=self.created_by, - crawl=self.snapshot.crawl, - snapshot=self.snapshot, - ) - - # Get allowlist/denylist (can be string or list) - allowlist_raw = config.get('URL_ALLOWLIST', '') - denylist_raw = config.get('URL_DENYLIST', '') - - # Normalize to list of patterns - def to_pattern_list(value): - if isinstance(value, list): - return value - if isinstance(value, str): - return [p.strip() for p in value.split(',') if p.strip()] - return [] - - allowlist = to_pattern_list(allowlist_raw) - denylist = to_pattern_list(denylist_raw) - - # Denylist takes precedence - if denylist: - for pattern in denylist: - try: - if re.search(pattern, url): - return False - except re.error: - continue # Skip invalid regex patterns - - # If allowlist exists, URL must match at least one pattern - if allowlist: - 
for pattern in allowlist: - try: - if re.search(pattern, url): - return True - except re.error: - continue # Skip invalid regex patterns - return False # No allowlist patterns matched - - return True # No filters or passed filters + return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot) @property def output_dir(self) -> Path: """Get the output directory for this plugin's results.""" return Path(self.snapshot.output_dir) / self.plugin - def is_background_hook(self) -> bool: - """Check if this ArchiveResult is for a background hook.""" - plugin_dir = Path(self.pwd) if self.pwd else None - if not plugin_dir: - return False - pid_file = plugin_dir / 'hook.pid' - return pid_file.exists() - - -# ============================================================================= -# ArchiveResult State Machine -# ============================================================================= - -class ArchiveResultMachine(BaseStateMachine): - """ - State machine for managing ArchiveResult (single plugin execution) lifecycle. - - Hook Lifecycle: - ┌─────────────────────────────────────────────────────────────┐ - │ QUEUED State │ - │ • Waiting for its turn to run │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() when can_start() - ┌─────────────────────────────────────────────────────────────┐ - │ STARTED State → enter_started() │ - │ 1. archiveresult.run() │ - │ • Find specific hook by hook_name │ - │ • run_hook(script, output_dir, ...) → subprocess │ - │ │ - │ 2a. FOREGROUND hook (returns HookResult): │ - │ • update_from_output() immediately │ - │ - Read stdout.log │ - │ - Parse JSONL records │ - │ - Extract 'ArchiveResult' record → update status │ - │ - Walk output_dir → populate output_files │ - │ - Call process_hook_records() for side effects │ - │ │ - │ 2b. 
BACKGROUND hook (returns None): │ - │ • Status stays STARTED │ - │ • Continues running in background │ - │ • Killed by Snapshot.cleanup() when sealed │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() checks status - ┌─────────────────────────────────────────────────────────────┐ - │ SUCCEEDED / FAILED / SKIPPED / BACKOFF │ - │ • Set by hook's JSONL output during update_from_output() │ - │ • Health stats incremented (num_uses_succeeded/failed) │ - │ • Parent Snapshot health stats also updated │ - └─────────────────────────────────────────────────────────────┘ - - https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams - """ - - model_attr_name = 'archiveresult' - - # States - queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True) - started = State(value=ArchiveResult.StatusChoices.STARTED) - backoff = State(value=ArchiveResult.StatusChoices.BACKOFF) - succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True) - failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True) - skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True) - - # Tick Event - transitions based on conditions - # Flow: queued → started → (succeeded|failed|skipped) - # queued → skipped (if exceeded max attempts) - # started → backoff → started (retry) - tick = ( - queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first - | queued.to.itself(unless='can_start') - | queued.to(started, cond='can_start') - | started.to(succeeded, cond='is_succeeded') - | started.to(failed, cond='is_failed') - | started.to(skipped, cond='is_skipped') - | started.to(backoff, cond='is_backoff') - | backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too - | backoff.to.itself(unless='can_start') - | backoff.to(started, cond='can_start') - # Removed redundant transitions: backoff.to(succeeded/failed/skipped) - # Reason: backoff should always retry→started, then started→final 
states - ) - - archiveresult: ArchiveResult - - def can_start(self) -> bool: - """Pure function - check if AR can start (has valid URL).""" - return bool(self.archiveresult.snapshot.url) - - def is_exceeded_max_attempts(self) -> bool: - """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results.""" - from archivebox.config.configset import get_config - - config = get_config( - crawl=self.archiveresult.snapshot.crawl, - snapshot=self.archiveresult.snapshot, - ) - max_attempts = config.get('MAX_URL_ATTEMPTS', 50) - - # Count failed ArchiveResults for this snapshot (any plugin type) - failed_count = self.archiveresult.snapshot.archiveresult_set.filter( - status=ArchiveResult.StatusChoices.FAILED - ).count() - - return failed_count >= max_attempts - - def is_succeeded(self) -> bool: - """Check if extractor plugin succeeded (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED - - def is_failed(self) -> bool: - """Check if extractor plugin failed (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED - - def is_skipped(self) -> bool: - """Check if extractor plugin was skipped (status was set by run()).""" - return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED - - def is_backoff(self) -> bool: - """Check if we should backoff and retry later.""" - # Backoff if status is still started (plugin didn't complete) and output_str is empty - return ( - self.archiveresult.status == ArchiveResult.StatusChoices.STARTED - and not self.archiveresult.output_str - ) - - def is_finished(self) -> bool: - """ - Check if extraction has completed (success, failure, or skipped). - - For background hooks in STARTED state, checks if their Process has finished and reaps them. 
- """ - # If already in final state, return True - if self.archiveresult.status in ( - ArchiveResult.StatusChoices.SUCCEEDED, - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ): - return True - - # If in STARTED state with a Process, check if Process has finished running - if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED: - if self.archiveresult.process_id: - process = self.archiveresult.process - - # If process is NOT running anymore, reap the background hook - if not process.is_running: - self.archiveresult.update_from_output() - # Check if now in final state after reaping - return self.archiveresult.status in ( - ArchiveResult.StatusChoices.SUCCEEDED, - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ) - - return False - - @queued.enter - def enter_queued(self): - self.archiveresult.update_and_requeue( - retry_at=timezone.now(), - status=ArchiveResult.StatusChoices.QUEUED, - start_ts=None, - ) # bump the snapshot's retry_at so they pickup any new changes - - @started.enter - def enter_started(self): - - # Update Process with network interface - if self.archiveresult.process_id: - self.archiveresult.process.iface = NetworkInterface.current() - self.archiveresult.process.save() - - # Lock the object and mark start time - self.archiveresult.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin - status=ArchiveResult.StatusChoices.STARTED, - start_ts=timezone.now(), - ) - - # Run the plugin - this updates status, output, timestamps, etc. 
- self.archiveresult.run() - - # Save the updated result - self.archiveresult.save() - - - @backoff.enter - def enter_backoff(self): - self.archiveresult.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=60), - status=ArchiveResult.StatusChoices.BACKOFF, - end_ts=None, - ) - - def _check_and_seal_parent_snapshot(self): - """ - Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot. - - Note: In the new architecture, the shared runner handles step advancement and sealing. - This method is kept for direct model-driven edge cases. - """ - import sys - - snapshot = self.archiveresult.snapshot - - # Check if all archiveresults are finished (in final states) - remaining_active = snapshot.archiveresult_set.exclude( - status__in=[ - ArchiveResult.StatusChoices.SUCCEEDED, - ArchiveResult.StatusChoices.FAILED, - ArchiveResult.StatusChoices.SKIPPED, - ] - ).count() - - if remaining_active == 0: - print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr) - # Seal the parent snapshot - cast(Any, snapshot).sm.seal() - - @succeeded.enter - def enter_succeeded(self): - import sys - - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.SUCCEEDED, - end_ts=timezone.now(), - ) - - # Update health stats for ArchiveResult, Snapshot, and Crawl cascade - self.archiveresult.cascade_health_update(success=True) - - print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr) - - # Check if this is the last AR to finish - seal parent snapshot if so - self._check_and_seal_parent_snapshot() - - @failed.enter - def enter_failed(self): - import sys - - print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr) - - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.FAILED, - 
end_ts=timezone.now(), - ) - - # Update health stats for ArchiveResult, Snapshot, and Crawl cascade - self.archiveresult.cascade_health_update(success=False) - - print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr) - - # Check if this is the last AR to finish - seal parent snapshot if so - self._check_and_seal_parent_snapshot() - - @skipped.enter - def enter_skipped(self): - import sys - - # Set output_str if not already set (e.g., when skipped due to max attempts) - if not self.archiveresult.output_str and self.is_exceeded_max_attempts(): - from archivebox.config.configset import get_config - config = get_config( - crawl=self.archiveresult.snapshot.crawl, - snapshot=self.archiveresult.snapshot, - ) - max_attempts = config.get('MAX_URL_ATTEMPTS', 50) - self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)' - - self.archiveresult.update_and_requeue( - retry_at=None, - status=ArchiveResult.StatusChoices.SKIPPED, - end_ts=timezone.now(), - ) - - print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr) - - # Check if this is the last AR to finish - seal parent snapshot if so - self._check_and_seal_parent_snapshot() - - # ============================================================================= # State Machine Registration # ============================================================================= @@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine): # Manually register state machines with python-statemachine registry # (normally auto-discovered from statemachines.py, but we define them here for clarity) registry.register(SnapshotMachine) -registry.register(ArchiveResultMachine) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 7f855b94..3a296516 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ 
-232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = { # https://gcollazo.com/optimal-sqlite-settings-for-django/ # https://litestream.io/tips/#busy-timeout # https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options - "timeout": 10, + "timeout": 30, "check_same_thread": False, "transaction_mode": "IMMEDIATE", "init_command": ( "PRAGMA foreign_keys=ON;" + "PRAGMA busy_timeout = 30000;" "PRAGMA journal_mode = WAL;" "PRAGMA synchronous = NORMAL;" "PRAGMA temp_store = MEMORY;" diff --git a/archivebox/core/tag_utils.py b/archivebox/core/tag_utils.py new file mode 100644 index 00000000..de562b34 --- /dev/null +++ b/archivebox/core/tag_utils.py @@ -0,0 +1,271 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from typing import Any + +from django.contrib.auth.models import User +from django.db.models import Count, F, Q, QuerySet +from django.db.models.functions import Lower +from django.http import HttpRequest +from django.urls import reverse + +from archivebox.core.host_utils import build_snapshot_url, build_web_url +from archivebox.core.models import Snapshot, SnapshotTag, Tag + + +TAG_SNAPSHOT_PREVIEW_LIMIT = 10 +TAG_SORT_CHOICES = ( + ('name_asc', 'Name A-Z'), + ('name_desc', 'Name Z-A'), + ('created_desc', 'Created newest'), + ('created_asc', 'Created oldest'), + ('snapshots_desc', 'Most snapshots'), + ('snapshots_asc', 'Fewest snapshots'), +) +TAG_HAS_SNAPSHOTS_CHOICES = ( + ('all', 'All'), + ('yes', 'Has snapshots'), + ('no', 'No snapshots'), +) + + +def normalize_tag_name(name: str) -> str: + return (name or '').strip() + + +def normalize_tag_sort(sort: str = 'created_desc') -> str: + valid_sorts = {key for key, _label in TAG_SORT_CHOICES} + return sort if sort in valid_sorts else 'created_desc' + + +def normalize_has_snapshots_filter(value: str = 'all') -> str: + valid_filters = {key for key, _label in TAG_HAS_SNAPSHOTS_CHOICES} + return value if value in valid_filters else 'all' + + +def 
normalize_created_by_filter(created_by: str = '') -> str: + return created_by if str(created_by).isdigit() else '' + + +def normalize_created_year_filter(year: str = '') -> str: + year = (year or '').strip() + return year if len(year) == 4 and year.isdigit() else '' + + +def get_matching_tags( + query: str = '', + sort: str = 'created_desc', + created_by: str = '', + year: str = '', + has_snapshots: str = 'all', +) -> QuerySet[Tag]: + queryset = Tag.objects.select_related('created_by').annotate( + num_snapshots=Count('snapshot_set', distinct=True), + ) + + query = normalize_tag_name(query) + if query: + queryset = queryset.filter( + Q(name__icontains=query) | Q(slug__icontains=query), + ) + + created_by = normalize_created_by_filter(created_by) + if created_by: + queryset = queryset.filter(created_by_id=int(created_by)) + + year = normalize_created_year_filter(year) + if year: + queryset = queryset.filter(created_at__year=int(year)) + + has_snapshots = normalize_has_snapshots_filter(has_snapshots) + if has_snapshots == 'yes': + queryset = queryset.filter(num_snapshots__gt=0) + elif has_snapshots == 'no': + queryset = queryset.filter(num_snapshots=0) + + sort = normalize_tag_sort(sort) + if sort == 'name_asc': + queryset = queryset.order_by(Lower('name'), 'id') + elif sort == 'name_desc': + queryset = queryset.order_by(Lower('name').desc(), '-id') + elif sort == 'created_asc': + queryset = queryset.order_by(F('created_at').asc(nulls_first=True), 'id', Lower('name')) + elif sort == 'snapshots_desc': + queryset = queryset.order_by(F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name')) + elif sort == 'snapshots_asc': + queryset = queryset.order_by(F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id') + else: + queryset = queryset.order_by(F('created_at').desc(nulls_last=True), '-id', Lower('name')) + + return queryset + + +def get_tag_creator_choices() -> list[tuple[str, str]]: + rows = ( + Tag.objects + 
.filter(created_by__isnull=False) + .values_list('created_by_id', 'created_by__username') + .order_by(Lower('created_by__username'), 'created_by_id') + .distinct() + ) + return [(str(user_id), username or f'User {user_id}') for user_id, username in rows] + + +def get_tag_year_choices() -> list[str]: + years = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC') + return [str(year.year) for year in years] + + +def get_tag_by_ref(tag_ref: str | int) -> Tag: + if isinstance(tag_ref, int): + return Tag.objects.get(pk=tag_ref) + + ref = str(tag_ref).strip() + if ref.isdigit(): + return Tag.objects.get(pk=int(ref)) + + try: + return Tag.objects.get(slug__iexact=ref) + except Tag.DoesNotExist: + return Tag.objects.get(slug__icontains=ref) + + +def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]: + normalized_name = normalize_tag_name(name) + if not normalized_name: + raise ValueError('Tag name is required') + + existing = Tag.objects.filter(name__iexact=normalized_name).first() + if existing: + return existing, False + + tag = Tag.objects.create( + name=normalized_name, + created_by=created_by, + ) + return tag, True + + +def rename_tag(tag: Tag, name: str) -> Tag: + normalized_name = normalize_tag_name(name) + if not normalized_name: + raise ValueError('Tag name is required') + + existing = Tag.objects.filter(name__iexact=normalized_name).exclude(pk=tag.pk).first() + if existing: + raise ValueError(f'Tag "{existing.name}" already exists') + + if tag.name != normalized_name: + tag.name = normalized_name + tag.save() + return tag + + +def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]: + return tag.delete() + + +def export_tag_urls(tag: Tag) -> str: + urls = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').values_list('url', flat=True) + return '\n'.join(urls) + + +def export_tag_snapshots_jsonl(tag: Tag) -> str: + snapshots = tag.snapshot_set.order_by('-downloaded_at', '-created_at', 
'-pk').prefetch_related('tags') + return '\n'.join(json.dumps(snapshot.to_json()) for snapshot in snapshots) + + +def _display_snapshot_title(snapshot: Snapshot) -> str: + title = (snapshot.title or '').strip() + url = (snapshot.url or '').strip() + if not title: + return url + + normalized_title = title.lower() + if normalized_title == 'pending...' or normalized_title == url.lower(): + return url + return title + + +def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]: + return { + 'id': str(snapshot.pk), + 'title': _display_snapshot_title(snapshot), + 'url': snapshot.url, + 'favicon_url': build_snapshot_url(str(snapshot.pk), 'favicon.ico', request=request), + 'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]), + 'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request), + 'downloaded_at': snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None, + } + + +def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]: + tag_ids = [tag.pk for tag in tags] + if not tag_ids: + return {} + + snapshot_tags = ( + SnapshotTag.objects + .filter(tag_id__in=tag_ids) + .select_related('snapshot__crawl__created_by') + .order_by( + 'tag_id', + F('snapshot__downloaded_at').desc(nulls_last=True), + F('snapshot__created_at').desc(nulls_last=True), + F('snapshot_id').desc(), + ) + ) + + preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list) + for snapshot_tag in snapshot_tags: + previews = preview_map[snapshot_tag.tag_id] + if len(previews) >= preview_limit: + continue + previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request)) + return preview_map + + +def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]: + count = getattr(tag, 'num_snapshots', tag.snapshot_set.count()) + return 
{ + 'id': tag.pk, + 'name': tag.name, + 'slug': tag.slug, + 'num_snapshots': count, + 'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}", + 'edit_url': reverse('admin:core_tag_change', args=[tag.pk]), + 'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]), + 'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]), + 'rename_url': reverse('api-1:rename_tag', args=[tag.pk]), + 'delete_url': reverse('api-1:delete_tag', args=[tag.pk]), + 'snapshots': snapshot_previews or [], + } + + +def build_tag_cards( + query: str = '', + request: HttpRequest | None = None, + limit: int | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, + sort: str = 'created_desc', + created_by: str = '', + year: str = '', + has_snapshots: str = 'all', +) -> list[dict[str, Any]]: + queryset = get_matching_tags( + query=query, + sort=sort, + created_by=created_by, + year=year, + has_snapshots=has_snapshots, + ) + if limit is not None: + queryset = queryset[:limit] + + tags = list(queryset) + preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit) + return [ + build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])) + for tag in tags + ] diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index 859a4c6f..a0323ca3 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -11,6 +11,7 @@ from archivebox.hooks import ( ) from archivebox.core.host_utils import ( get_admin_base_url, + get_public_base_url, get_web_base_url, get_snapshot_base_url, build_snapshot_url, @@ -166,6 +167,11 @@ def web_base_url(context) -> str: return get_web_base_url(request=context.get('request')) +@register.simple_tag(takes_context=True) +def public_base_url(context) -> str: + return get_public_base_url(request=context.get('request')) + + @register.simple_tag(takes_context=True) def snapshot_base_url(context, 
snapshot) -> str: snapshot_id = getattr(snapshot, 'id', snapshot) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 5ff0d2fd..d63af6dc 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1,5 +1,6 @@ __package__ = 'archivebox.core' +import json import os import posixpath from glob import glob, escape @@ -7,7 +8,7 @@ from django.utils import timezone import inspect from typing import Callable, cast, get_type_hints from pathlib import Path -from urllib.parse import urlparse +from urllib.parse import quote, urlparse from django.shortcuts import render, redirect from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden @@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION -from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG from archivebox.config.configset import get_flat_config, get_config, get_all_configs from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode from archivebox.misc.serve_static import serve_static_with_byterange_support @@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot from archivebox.core.host_utils import build_snapshot_url from archivebox.core.forms import AddLinkForm from archivebox.crawls.models import Crawl -from archivebox.hooks import get_enabled_plugins, get_plugin_name +from archivebox.hooks import ( + BUILTIN_PLUGINS_DIR, + USER_PLUGINS_DIR, + discover_plugin_configs, + get_enabled_plugins, + get_plugin_name, + iter_plugin_dirs, +) + + +ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/' +LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/' def 
_files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: @@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""): rel_path = path or "" show_indexes = bool(request.GET.get("files")) + if not show_indexes and (not rel_path or rel_path == "index.html"): + return SnapshotView.render_live_index(request, snapshot) + if not rel_path or rel_path.endswith("/"): if show_indexes: rel_path = rel_path.rstrip("/") @@ -784,7 +799,6 @@ class SnapshotHostView(View): raise Http404 return _serve_snapshot_replay(request, snapshot, path) - class SnapshotReplayView(View): """Serve snapshot directory contents on a one-domain replay path.""" @@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView): return custom_config def get_context_data(self, **kwargs): - from archivebox.core.models import Tag - + required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip() + plugin_configs = discover_plugin_configs() + plugin_dependency_map = { + plugin_name: [ + str(required_plugin).strip() + for required_plugin in (schema.get('required_plugins') or []) + if str(required_plugin).strip() + ] + for plugin_name, schema in plugin_configs.items() + if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins') + } return { **super().get_context_data(**kwargs), 'title': "Create Crawl", @@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView): 'absolute_add_path': self.request.build_absolute_uri(self.request.path), 'VERSION': VERSION, 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, + 'required_search_plugin': required_search_plugin, + 'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True), 'stdout': '', - 'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)), } def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl: 
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView): depth = int(form.cleaned_data["depth"]) plugins = ','.join(form.cleaned_data.get("plugins", [])) schedule = form.cleaned_data.get("schedule", "").strip() - persona = form.cleaned_data.get("persona", "Default") - overwrite = form.cleaned_data.get("overwrite", False) - update = form.cleaned_data.get("update", False) + persona = form.cleaned_data.get("persona") index_only = form.cleaned_data.get("index_only", False) notes = form.cleaned_data.get("notes", "") + url_filters = form.cleaned_data.get("url_filters") or {} custom_config = self._get_custom_config_overrides(form) from archivebox.config.permissions import HOSTNAME @@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView): # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt' + sources_file.parent.mkdir(parents=True, exist_ok=True) sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) # 2. create a new Crawl with the URLs from the file @@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView): urls_content = sources_file.read_text() # Build complete config config = { - 'ONLY_NEW': not update, 'INDEX_ONLY': index_only, - 'OVERWRITE': overwrite, 'DEPTH': depth, 'PLUGINS': plugins or '', - 'DEFAULT_PERSONA': persona or 'Default', + 'DEFAULT_PERSONA': (persona.name if persona else 'Default'), } # Merge custom config overrides config.update(custom_config) + if url_filters.get('allowlist'): + config['URL_ALLOWLIST'] = url_filters['allowlist'] + if url_filters.get('denylist'): + config['URL_DENYLIST'] = url_filters['denylist'] crawl = Crawl.objects.create( urls=urls_content, @@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView): crawl.schedule = crawl_schedule crawl.save(update_fields=['schedule']) + crawl.create_snapshots_from_urls() + # 4. 
start the Orchestrator & wait until it completes # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ... # from archivebox.crawls.actors import CrawlActor @@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView): urls = form.cleaned_data["url"] schedule = form.cleaned_data.get("schedule", "").strip() - rough_url_count = urls.count('://') + rough_url_count = len([url for url in urls.splitlines() if url.strip()]) # Build success message with schedule link if created schedule_msg = "" @@ -1080,10 +1108,6 @@ class WebAddView(AddView): 'persona': defaults_form.fields['persona'].initial or 'Default', 'config': {}, } - if defaults_form.fields['update'].initial: - form_data['update'] = 'on' - if defaults_form.fields['overwrite'].initial: - form_data['overwrite'] = 'on' if defaults_form.fields['index_only'].initial: form_data['index_only'] = 'on' @@ -1118,6 +1142,41 @@ def live_progress_view(request): from archivebox.core.models import Snapshot, ArchiveResult from archivebox.machine.models import Process, Machine + def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]: + normalized_hook_name = Path(hook_name).name if hook_name else "" + if not normalized_hook_name: + return (plugin, plugin, "unknown", "") + + phase = "unknown" + if normalized_hook_name.startswith("on_Crawl__"): + phase = "crawl" + elif normalized_hook_name.startswith("on_Snapshot__"): + phase = "snapshot" + elif normalized_hook_name.startswith("on_Binary__"): + phase = "binary" + + label = normalized_hook_name + if "__" in normalized_hook_name: + label = normalized_hook_name.split("__", 1)[1] + label = label.rsplit(".", 1)[0] + if len(label) > 3 and label[:2].isdigit() and label[2] == "_": + label = label[3:] + label = label.replace("_", " ").strip() or plugin + + return (plugin, label, phase, normalized_hook_name) + + def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]: + 
hook_path = "" + if isinstance(cmd, list) and cmd: + first = cmd[0] + if isinstance(first, str): + hook_path = first + + if not hook_path: + return ("", "setup", "unknown", "") + + return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup") + machine = Machine.current() orchestrator_proc = Process.objects.filter( machine=machine, @@ -1188,8 +1247,19 @@ def live_progress_view(request): Process.TypeChoices.BINARY, ], ) + recent_processes = Process.objects.filter( + machine=machine, + process_type__in=[ + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + modified_at__gte=timezone.now() - timedelta(minutes=10), + ).order_by("-modified_at") crawl_process_pids: dict[str, int] = {} snapshot_process_pids: dict[str, int] = {} + process_records_by_crawl: dict[str, list[dict[str, object]]] = {} + process_records_by_snapshot: dict[str, list[dict[str, object]]] = {} + seen_process_records: set[str] = set() for proc in running_processes: env = proc.env or {} if not isinstance(env, dict): @@ -1197,11 +1267,48 @@ def live_progress_view(request): crawl_id = env.get('CRAWL_ID') snapshot_id = env.get('SNAPSHOT_ID') + _plugin, _label, phase, _hook_name = process_label(proc.cmd) if crawl_id and proc.pid: crawl_process_pids.setdefault(str(crawl_id), proc.pid) - if snapshot_id and proc.pid: + if phase == "snapshot" and snapshot_id and proc.pid: snapshot_process_pids.setdefault(str(snapshot_id), proc.pid) + for proc in recent_processes: + env = proc.env or {} + if not isinstance(env, dict): + env = {} + + crawl_id = env.get("CRAWL_ID") + snapshot_id = env.get("SNAPSHOT_ID") + if not crawl_id and not snapshot_id: + continue + + plugin, label, phase, hook_name = process_label(proc.cmd) + + record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id) + proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}" + if proc_key in seen_process_records: + continue + seen_process_records.add(proc_key) + + 
status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded") + payload: dict[str, object] = { + "id": str(proc.id), + "plugin": plugin, + "label": label, + "hook_name": hook_name, + "status": status, + "phase": phase, + "source": "process", + "process_id": str(proc.id), + } + if status == "started" and proc.pid: + payload["pid"] = proc.pid + if phase == "snapshot" and snapshot_id: + process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload) + elif crawl_id: + process_records_by_crawl.setdefault(str(crawl_id), []).append(payload) + active_crawls_qs = Crawl.objects.filter( status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED] ).prefetch_related( @@ -1234,6 +1341,11 @@ def live_progress_view(request): # Calculate crawl progress crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 + crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), [])) + crawl_setup_total = len(crawl_setup_plugins) + crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded") + crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed") + crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued") # Get active snapshots for this crawl (already prefetched) active_snapshots_for_crawl = [] @@ -1241,28 +1353,21 @@ def live_progress_view(request): # Get archive results for this snapshot (already prefetched) snapshot_results = snapshot.archiveresult_set.all() - # Count in memory instead of DB queries - total_plugins = len(snapshot_results) - completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED) - failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED) - pending_plugins = sum(1 for ar in snapshot_results if ar.status == 
ArchiveResult.StatusChoices.QUEUED) - - # Calculate snapshot progress using per-plugin progress now = timezone.now() plugin_progress_values: list[int] = [] + all_plugins: list[dict[str, object]] = [] + seen_plugin_keys: set[str] = set() - # Get all extractor plugins for this snapshot (already prefetched, sort in Python) - # Order: started first, then queued, then completed def plugin_sort_key(ar): status_order = { ArchiveResult.StatusChoices.STARTED: 0, ArchiveResult.StatusChoices.QUEUED: 1, ArchiveResult.StatusChoices.SUCCEEDED: 2, - ArchiveResult.StatusChoices.FAILED: 3, + ArchiveResult.StatusChoices.NORESULTS: 3, + ArchiveResult.StatusChoices.FAILED: 4, } - return (status_order.get(ar.status, 4), ar.plugin) + return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "") - all_plugins = [] for ar in sorted(snapshot_results, key=plugin_sort_key): status = ar.status progress_value = 0 @@ -1270,6 +1375,7 @@ def live_progress_view(request): ArchiveResult.StatusChoices.SUCCEEDED, ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, ): progress_value = 100 elif status == ArchiveResult.StatusChoices.STARTED: @@ -1284,20 +1390,49 @@ def live_progress_view(request): progress_value = 0 plugin_progress_values.append(progress_value) + plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin) plugin_payload = { 'id': str(ar.id), 'plugin': ar.plugin, + 'label': label, + 'hook_name': hook_name, + 'phase': phase, 'status': status, + 'process_id': str(ar.process_id) if ar.process_id else None, } if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process: plugin_payload['pid'] = ar.process.pid if status == ArchiveResult.StatusChoices.STARTED: plugin_payload['progress'] = progress_value plugin_payload['timeout'] = ar.timeout or 120 + plugin_payload['source'] = 'archiveresult' all_plugins.append(plugin_payload) + seen_plugin_keys.add( + str(ar.process_id) if 
ar.process_id else f"{ar.plugin}:{hook_name}" + ) - snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0 + for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []): + proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}") + if proc_key in seen_plugin_keys: + continue + seen_plugin_keys.add(proc_key) + all_plugins.append(proc_payload) + + proc_status = proc_payload.get("status") + if proc_status in ("succeeded", "failed", "skipped"): + plugin_progress_values.append(100) + elif proc_status == "started": + plugin_progress_values.append(1) + else: + plugin_progress_values.append(0) + + total_plugins = len(all_plugins) + completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded") + failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed") + pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued") + + snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0 active_snapshots_for_crawl.append({ 'id': str(snapshot.id), @@ -1334,6 +1469,11 @@ def live_progress_view(request): 'started_snapshots': started_snapshots, 'failed_snapshots': 0, 'pending_snapshots': pending_snapshots, + 'setup_plugins': crawl_setup_plugins, + 'setup_total_plugins': crawl_setup_total, + 'setup_completed_plugins': crawl_setup_completed, + 'setup_failed_plugins': crawl_setup_failed, + 'setup_pending_plugins': crawl_setup_pending, 'active_snapshots': active_snapshots_for_crawl, 'can_start': can_start, 'urls_preview': urls_preview, @@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str: """Determine where a config value comes from.""" from archivebox.machine.models import Machine - # Check if it's from archivebox.machine.config + # Environment variables override all persistent config sources. 
+ if key in os.environ: + return 'Environment' + + # Machine.config overrides ArchiveBox.conf. try: machine = Machine.current() if machine.config and key in machine.config: @@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str: except Exception: pass - # Check if it's from environment variable - if key in os.environ: - return 'Environment' - # Check if it's from archivebox.config.file from archivebox.config.configset import BaseConfigSet file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) @@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str: return 'Default' +def find_plugin_for_config_key(key: str) -> str | None: + for plugin_name, schema in discover_plugin_configs().items(): + if key in (schema.get('properties') or {}): + return plugin_name + return None + + +def get_config_definition_link(key: str) -> tuple[str, str]: + plugin_name = find_plugin_for_config_key(key) + if not plugin_name: + return ( + f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code', + 'archivebox/config', + ) + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return ( + f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json', + f'abx_plugins/plugins/{plugin_name}/config.json', + ) + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return ( + f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/', + f'data/custom_plugins/{plugin_name}/config.json', + ) + + return ( + f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/', + f'abx_plugins/plugins/{plugin_name}/config.json', + ) + + @render_with_table_view def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: CONFIGS = get_all_configs() @@ -1566,17 +1743,6 @@ def live_config_value_view(request: 
HttpRequest, key: str, **kwargs) -> ItemCont # Determine all sources for this config value sources_info = [] - # Default value - default_val = find_config_default(key) - if default_val: - sources_info.append(('Default', default_val, 'gray')) - - # Config file value - if CONSTANTS.CONFIG_FILE.exists(): - file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) - if key in file_config: - sources_info.append(('Config File', file_config[key], 'green')) - # Environment variable if key in os.environ: sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue')) @@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont except Exception: pass + # Config file value + if CONSTANTS.CONFIG_FILE.exists(): + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + sources_info.append(('Config File', file_config[key], 'green')) + + # Default value + default_val = find_config_default(key) + if default_val: + sources_info.append(('Default', default_val, 'gray')) + # Final computed value final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))) if not key_is_safe(key): @@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont section_header = mark_safe(f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)') + definition_url, definition_label = get_config_definition_link(key) + section_data = cast(SectionData, { "name": section_header, "description": None, @@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont 'Key': key, 'Type': find_config_type(key), 'Value': final_value, - 'Source': find_config_source(key, merged_config), + 'Currently read from': find_config_source(key, merged_config), }, "help_texts": { 'Key': mark_safe(f''' @@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont 
'''), 'Type': mark_safe(f''' - - See full definition in archivebox/config... + + See full definition in {definition_label}... '''), 'Value': mark_safe(f''' {'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

' if not key_is_safe(key) else ''}


- Configuration Sources (in priority order):

+ Configuration Sources (highest priority first):

{sources_html}

@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont }"

'''), - 'Source': mark_safe(f''' + 'Currently read from': mark_safe(f''' The value shown in the "Value" field comes from the {find_config_source(key, merged_config)} source.

Priority order (highest to lowest):
    +
  1. Environment - Environment variables
  2. Machine - Machine-specific overrides (e.g., resolved binary paths) {f'
    → Edit {key} in Machine.config for this server' if machine_admin_url else ''}
  3. -
  4. Environment - Environment variables
  5. Config File - data/ArchiveBox.conf
  6. Default - Default value from code
diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py index 6e9fe475..af30544c 100644 --- a/archivebox/core/widgets.py +++ b/archivebox/core/widgets.py @@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget): }}; window.updateHiddenInput_{widget_id} = function() {{ - document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(','); + var hiddenInput = document.getElementById('{widget_id}'); + if (!hiddenInput) {{ + return; + }} + hiddenInput.value = currentTags_{widget_id}.join(','); + hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }})); + hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }})); + }}; + + function normalizeTags_{widget_id}(value) {{ + var rawTags = Array.isArray(value) ? value : String(value || '').split(','); + var seen = {{}}; + return rawTags + .map(function(tag) {{ return String(tag || '').trim(); }}) + .filter(function(tag) {{ + if (!tag) return false; + var normalized = tag.toLowerCase(); + if (seen[normalized]) return false; + seen[normalized] = true; + return true; + }}) + .sort(function(a, b) {{ + return a.toLowerCase().localeCompare(b.toLowerCase()); + }}); + }} + + window.setTags_{widget_id} = function(value, options) {{ + currentTags_{widget_id} = normalizeTags_{widget_id}(value); + rebuildPills_{widget_id}(); + if (!(options && options.skipHiddenUpdate)) {{ + updateHiddenInput_{widget_id}(); + }} + }}; + + window.syncTagEditorFromHidden_{widget_id} = function() {{ + var hiddenInput = document.getElementById('{widget_id}'); + if (!hiddenInput) {{ + return; + }} + setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }}); }}; function computeTagStyle_{widget_id}(tagName) {{ @@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget): // Add to current tags currentTags_{widget_id}.push(tagName); - currentTags_{widget_id}.sort(function(a, b) {{ - return a.toLowerCase().localeCompare(b.toLowerCase()); - }}); + currentTags_{widget_id} = 
normalizeTags_{widget_id}(currentTags_{widget_id}); // Rebuild pills rebuildPills_{widget_id}(); @@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget): }} }}); + document.getElementById('{widget_id}').addEventListener('change', function() {{ + syncTagEditorFromHidden_{widget_id}(); + }}); + + document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{ + syncTagEditorFromHidden_{widget_id}(); + }}); + window.handleTagKeydown_{widget_id} = function(event) {{ var input = event.target; var value = input.value.trim(); @@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget): var input = document.querySelector('input[name="csrfmiddlewaretoken"]'); return input ? input.value : ''; }} + + syncTagEditorFromHidden_{widget_id}(); }})(); ''' @@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget): return mark_safe(html) +class URLFiltersWidget(forms.Widget): + """Render URL allowlist / denylist controls with same-domain autofill.""" + + template_name = "" + + def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'): + self.source_selector = source_selector + super().__init__(attrs) + + def render(self, name, value, attrs=None, renderer=None): + value = value if isinstance(value, dict) else {} + widget_id_raw = attrs.get('id', name) if attrs else name + widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name + allowlist = escape(value.get('allowlist', '') or '') + denylist = escape(value.get('denylist', '') or '') + + return mark_safe(f''' +
+ +
+
+
+ + Regex patterns or domains to include, one pattern per line.
+ +
+
+
+ + Regex patterns or domains to exclude, one pattern per line. +
+ +
+
+ +
These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.
+ +
+ ''') + + def value_from_datadict(self, data, files, name): + return { + 'allowlist': data.get(f'{name}_allowlist', ''), + 'denylist': data.get(f'{name}_denylist', ''), + 'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'), + } + + class InlineTagEditorWidget(TagEditorWidget): """ Inline version of TagEditorWidget for use in list views. Includes AJAX save functionality for immediate persistence. """ - def __init__(self, attrs=None, snapshot_id=None): + def __init__(self, attrs=None, snapshot_id=None, editable=True): super().__init__(attrs, snapshot_id) self.snapshot_id = snapshot_id + self.editable = editable def render(self, name, value, attrs=None, renderer=None, snapshot_id=None): """Render inline tag editor with AJAX save.""" @@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget): # Build pills HTML with filter links pills_html = '' for td in tag_data: + remove_button = '' + if self.editable: + remove_button = ( + f'' + ) pills_html += f''' {self._escape(td['name'])} - + {remove_button} ''' tags_json = escape(json.dumps(tag_data)) - - html = f''' - - - {pills_html} - + input_html = '' + readonly_class = ' readonly' if not self.editable else '' + if self.editable: + input_html = f''' + ''' + + html = f''' + + + {pills_html} + + {input_html} ''' diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 2e637ff0..4c83e97b 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -1,8 +1,11 @@ __package__ = 'archivebox.crawls' - from django import forms -from django.utils.html import format_html, format_html_join +from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed +from django.shortcuts import get_object_or_404, redirect +from django.urls import path, reverse +from django.utils.html import escape, format_html, format_html_join +from django.utils import timezone from django.utils.safestring import mark_safe from django.contrib import admin, messages from 
django.db.models import Count, Q @@ -13,16 +16,19 @@ from django_object_actions import action from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from archivebox.core.models import Snapshot +from archivebox.core.widgets import TagEditorWidget from archivebox.crawls.models import Crawl, CrawlSchedule -def render_snapshots_list(snapshots_qs, limit=20): +def render_snapshots_list(snapshots_qs, limit=20, crawl=None): """Render a nice inline list view of snapshots with status, title, URL, and progress.""" snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate( total_results=Count('archiveresult'), succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')), failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')), + started_results=Count('archiveresult', filter=Q(archiveresult__status='started')), + skipped_results=Count('archiveresult', filter=Q(archiveresult__status='skipped')), ) if not snapshots: @@ -43,17 +49,57 @@ def render_snapshots_list(snapshots_qs, limit=20): # Calculate progress total = snapshot.total_results - done = snapshot.succeeded_results + snapshot.failed_results + succeeded = snapshot.succeeded_results + failed = snapshot.failed_results + running = snapshot.started_results + skipped = snapshot.skipped_results + done = succeeded + failed + skipped + pending = max(total - done - running, 0) progress_pct = int((done / total) * 100) if total > 0 else 0 progress_text = f'{done}/{total}' if total > 0 else '-' + progress_title = ( + f'{succeeded} succeeded, {failed} failed, {running} running, ' + f'{pending} pending, {skipped} skipped' + ) + progress_color = '#28a745' + if failed: + progress_color = '#dc3545' + elif running: + progress_color = '#17a2b8' + elif pending: + progress_color = '#ffc107' # Truncate title and URL - title = (snapshot.title or 'Untitled')[:60] - if len(snapshot.title or '') > 60: + snapshot_title = snapshot.title or 'Untitled' + title = 
snapshot_title[:60] + if len(snapshot_title) > 60: title += '...' url_display = snapshot.url[:50] if len(snapshot.url) > 50: url_display += '...' + delete_button = '' + exclude_button = '' + if crawl is not None: + delete_url = reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk]) + exclude_url = reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, snapshot.pk]) + delete_button = f''' + + ''' + exclude_button = f''' + + ''' # Format date date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-' @@ -74,18 +120,18 @@ def render_snapshots_list(snapshots_qs, limit=20):
+ {"" % (exclude_button, delete_button) if crawl is not None else ""} ''') @@ -111,7 +158,7 @@ def render_snapshots_list(snapshots_qs, limit=20): ''' return mark_safe(f''' -
+
IDDetails Status Plugin {title} + title="{escape(snapshot_title)}">{escape(title)} - {url_display} + title="{escape(snapshot.url)}">{escape(url_display)} -
%s%s
@@ -121,6 +168,7 @@ def render_snapshots_list(snapshots_qs, limit=20): + {'' if crawl is not None else ''} @@ -129,11 +177,197 @@ def render_snapshots_list(snapshots_qs, limit=20):
URL Progress CreatedActions
+ {''' + + ''' if crawl is not None else ''} ''') +class URLFiltersWidget(forms.Widget): + def render(self, name, value, attrs=None, renderer=None): + value = value if isinstance(value, dict) else {} + widget_id = (attrs or {}).get('id', name) + allowlist = escape(value.get('allowlist', '') or '') + denylist = escape(value.get('denylist', '') or '') + + return mark_safe(f''' +
+ +
+
+ + +
+
+ + +
+
+ +

+ Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist. +

+ +
+ ''') + + def value_from_datadict(self, data, files, name): + return { + 'allowlist': data.get(f'{name}_allowlist', ''), + 'denylist': data.get(f'{name}_denylist', ''), + 'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'), + } + + +class URLFiltersField(forms.Field): + widget = URLFiltersWidget + + def to_python(self, value): + if isinstance(value, dict): + return value + return {'allowlist': '', 'denylist': '', 'same_domain_only': False} + + class CrawlAdminForm(forms.ModelForm): """Custom form for Crawl admin to render urls field as textarea.""" + tags_editor = forms.CharField( + label='Tags', + required=False, + widget=TagEditorWidget(), + help_text='Type tag names and press Enter or Space to add. Click × to remove.', + ) + url_filters = URLFiltersField( + label='URL Filters', + required=False, + help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.', + ) class Meta: model = Crawl @@ -144,8 +378,62 @@ class CrawlAdminForm(forms.ModelForm): 'style': 'width: 100%; font-family: monospace; font-size: 13px;', 'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #', }), + 'notes': forms.Textarea(attrs={ + 'rows': 1, + 'style': 'width: 100%; min-height: 0; resize: vertical;', + }), } + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {} + if self.instance and self.instance.pk: + self.initial['tags_editor'] = self.instance.tags_str + self.initial['url_filters'] = { + 'allowlist': config.get('URL_ALLOWLIST', ''), + 'denylist': config.get('URL_DENYLIST', ''), + 'same_domain_only': False, + } + + def clean_tags_editor(self): + tags_str = self.cleaned_data.get('tags_editor', '') + tag_names = [] + seen = set() + for raw_name in tags_str.split(','): + name = raw_name.strip() + if not name: + continue + lowered = name.lower() + if lowered in seen: + continue + seen.add(lowered) + tag_names.append(name) + 
return ','.join(tag_names) + + def clean_url_filters(self): + value = self.cleaned_data.get('url_filters') or {} + return { + 'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))), + 'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))), + 'same_domain_only': bool(value.get('same_domain_only')), + } + + def save(self, commit=True): + instance = super().save(commit=False) + instance.tags_str = self.cleaned_data.get('tags_editor', '') + url_filters = self.cleaned_data.get('url_filters') or {} + instance.set_url_filters( + url_filters.get('allowlist', ''), + url_filters.get('denylist', ''), + ) + if commit: + instance.save() + instance.apply_crawl_config_filters() + save_m2m = getattr(self, '_save_m2m', None) + if callable(save_m2m): + save_m2m() + return instance + class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): form = CrawlAdminForm @@ -161,11 +449,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): 'classes': ('card', 'wide'), }), ('Info', { - 'fields': ('label', 'notes', 'tags_str'), + 'fields': ('label', 'notes', 'tags_editor'), 'classes': ('card',), }), ('Settings', { - 'fields': ('max_depth', 'config'), + 'fields': (('max_depth', 'url_filters'), 'config'), 'classes': ('card',), }), ('Status', { @@ -185,6 +473,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): 'classes': ('card', 'wide'), }), ) + add_fieldsets = ( + ('URLs', { + 'fields': ('urls',), + 'classes': ('card', 'wide'), + }), + ('Info', { + 'fields': ('label', 'notes', 'tags_editor'), + 'classes': ('card',), + }), + ('Settings', { + 'fields': (('max_depth', 'url_filters'), 'config'), + 'classes': ('card',), + }), + ('Status', { + 'fields': ('status', 'retry_at'), + 'classes': ('card',), + }), + ('Relations', { + 'fields': ('schedule', 'created_by'), + 'classes': ('card',), + }), + ) list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at') ordering = ['-created_at', '-retry_at'] @@ -199,6 +509,25 @@ class 
CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): num_snapshots_cached=Count('snapshot_set') ) + def get_fieldsets(self, request, obj=None): + return self.fieldsets if obj else self.add_fieldsets + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path( + '/snapshot//delete/', + self.admin_site.admin_view(self.delete_snapshot_view), + name='crawls_crawl_snapshot_delete', + ), + path( + '/snapshot//exclude-domain/', + self.admin_site.admin_view(self.exclude_domain_view), + name='crawls_crawl_snapshot_exclude_domain', + ), + ] + return custom_urls + urls + @admin.action(description='Delete selected crawls') def delete_selected_batched(self, request, queryset): """Delete crawls in a single transaction to avoid SQLite concurrency issues.""" @@ -218,8 +547,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): @action(label='Recrawl', description='Create a new crawl with the same settings') def recrawl(self, request, obj): """Duplicate this crawl as a new crawl with the same URLs and settings.""" - from django.utils import timezone - from django.shortcuts import redirect # Validate URLs (required for crawl to start) if not obj.urls: @@ -252,7 +579,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count()) def snapshots(self, obj): - return render_snapshots_list(obj.snapshot_set.all()) + return render_snapshots_list(obj.snapshot_set.all(), crawl=obj) + + def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != 'POST': + return HttpResponseNotAllowed(['POST']) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + + if snapshot.status == Snapshot.StatusChoices.STARTED: + snapshot.cancel_running_hooks() + + removed_urls = crawl.prune_url(snapshot.url) + snapshot.delete() + return JsonResponse({ + 'ok': True, + 'snapshot_id': str(snapshot.id), + 'removed_urls': 
removed_urls, + }) + + def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != 'POST': + return HttpResponseNotAllowed(['POST']) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + result = crawl.exclude_domain(snapshot.url) + return JsonResponse({ + 'ok': True, + **result, + }) @admin.display(description='Schedule', ordering='schedule') def schedule_str(self, obj): diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index afdb928f..77023c55 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -2,9 +2,12 @@ __package__ = 'archivebox.crawls' from typing import TYPE_CHECKING import uuid +import json +import re from datetime import timedelta from archivebox.uuid_compat import uuid7 from pathlib import Path +from urllib.parse import urlparse from django.db import models from django.core.validators import MaxValueValidator, MinValueValidator @@ -141,22 +144,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return f'[...{short_id}] {first_url[:120]}' def save(self, *args, **kwargs): - is_new = self._state.adding super().save(*args, **kwargs) - if is_new: - from archivebox.misc.logging_util import log_worker_event - first_url = self.get_urls_list()[0] if self.get_urls_list() else '' - log_worker_event( - worker_type='DB', - event='Created Crawl', - indent_level=1, - metadata={ - 'id': str(self.id), - 'first_url': first_url[:64], - 'max_depth': self.max_depth, - 'status': self.status, - }, - ) + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # first_url = self.get_urls_list()[0] if self.get_urls_list() else '' + # log_worker_event( + # worker_type='DB', + # event='Created Crawl', + # indent_level=1, + # metadata={ + # 'id': str(self.id), + # 'first_url': first_url[:64], + # 'max_depth': self.max_depth, + # 'status': self.status, + # }, + # ) 
@property def api_url(self) -> str: @@ -248,6 +250,222 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if url.strip() and not url.strip().startswith('#') ] + @staticmethod + def normalize_domain(value: str) -> str: + candidate = (value or '').strip().lower() + if not candidate: + return '' + if '://' not in candidate and '/' not in candidate: + candidate = f'https://{candidate.lstrip(".")}' + try: + parsed = urlparse(candidate) + hostname = parsed.hostname or '' + if not hostname: + return '' + if parsed.port: + return f'{hostname}_{parsed.port}' + return hostname + except Exception: + return '' + + @staticmethod + def split_filter_patterns(value) -> list[str]: + patterns = [] + seen = set() + if isinstance(value, list): + raw_values = value + elif isinstance(value, str): + raw_values = value.splitlines() + else: + raw_values = [] + + for raw_value in raw_values: + pattern = str(raw_value or '').strip() + if not pattern or pattern in seen: + continue + seen.add(pattern) + patterns.append(pattern) + return patterns + + @classmethod + def _pattern_matches_url(cls, url: str, pattern: str) -> bool: + normalized_pattern = str(pattern or '').strip() + if not normalized_pattern: + return False + + if re.fullmatch(r'[\w.*:-]+', normalized_pattern): + wildcard_only_subdomains = normalized_pattern.startswith('*.') + normalized_domain = cls.normalize_domain( + normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern + ) + normalized_url_domain = cls.normalize_domain(url) + if not normalized_domain or not normalized_url_domain: + return False + + pattern_host = normalized_domain.split('_', 1)[0] + url_host = normalized_url_domain.split('_', 1)[0] + + if wildcard_only_subdomains: + return url_host.endswith(f'.{pattern_host}') + + if normalized_url_domain == normalized_domain: + return True + return url_host == pattern_host or url_host.endswith(f'.{pattern_host}') + + try: + return bool(re.search(normalized_pattern, url)) + 
except re.error: + return False + + def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]: + if use_effective_config: + from archivebox.config.configset import get_config + + config = get_config(crawl=self, snapshot=snapshot) + else: + config = self.config or {} + return self.split_filter_patterns(config.get('URL_ALLOWLIST', '')) + + def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]: + if use_effective_config: + from archivebox.config.configset import get_config + + config = get_config(crawl=self, snapshot=snapshot) + else: + config = self.config or {} + return self.split_filter_patterns(config.get('URL_DENYLIST', '')) + + def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool: + denylist = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot) + allowlist = self.get_url_allowlist(use_effective_config=use_effective_config, snapshot=snapshot) + + for pattern in denylist: + if self._pattern_matches_url(url, pattern): + return False + + if allowlist: + return any(self._pattern_matches_url(url, pattern) for pattern in allowlist) + + return True + + def set_url_filters(self, allowlist, denylist) -> None: + config = dict(self.config or {}) + allow_patterns = self.split_filter_patterns(allowlist) + deny_patterns = self.split_filter_patterns(denylist) + + if allow_patterns: + config['URL_ALLOWLIST'] = '\n'.join(allow_patterns) + else: + config.pop('URL_ALLOWLIST', None) + + if deny_patterns: + config['URL_DENYLIST'] = '\n'.join(deny_patterns) + else: + config.pop('URL_DENYLIST', None) + + self.config = config + + def apply_crawl_config_filters(self) -> dict[str, int]: + from archivebox.core.models import Snapshot + + removed_urls = self.prune_urls( + lambda url: not self.url_passes_filters(url, use_effective_config=False) + ) + + filtered_snapshots = [ + snapshot + for snapshot in self.snapshot_set.filter( + 
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], + ).only('pk', 'url', 'status') + if not self.url_passes_filters(snapshot.url, snapshot=snapshot, use_effective_config=False) + ] + + deleted_snapshots = 0 + if filtered_snapshots: + started_snapshots = [ + snapshot for snapshot in filtered_snapshots + if snapshot.status == Snapshot.StatusChoices.STARTED + ] + for snapshot in started_snapshots: + snapshot.cancel_running_hooks() + + filtered_snapshot_ids = [snapshot.pk for snapshot in filtered_snapshots] + deleted_snapshots, _ = self.snapshot_set.filter(pk__in=filtered_snapshot_ids).delete() + + return { + 'removed_urls': len(removed_urls), + 'deleted_snapshots': deleted_snapshots, + } + + def _iter_url_lines(self) -> list[tuple[str, str]]: + entries: list[tuple[str, str]] = [] + for raw_line in (self.urls or '').splitlines(): + stripped = raw_line.strip() + if not stripped: + continue + if stripped.startswith('#'): + entries.append((raw_line.rstrip(), '')) + continue + try: + entry = json.loads(stripped) + entries.append((raw_line.rstrip(), str(entry.get('url', '') or '').strip())) + except json.JSONDecodeError: + entries.append((raw_line.rstrip(), stripped)) + return entries + + def prune_urls(self, predicate) -> list[str]: + kept_lines: list[str] = [] + removed_urls: list[str] = [] + + for raw_line, url in self._iter_url_lines(): + if not url: + kept_lines.append(raw_line) + continue + if predicate(url): + removed_urls.append(url) + continue + kept_lines.append(raw_line) + + next_urls = '\n'.join(kept_lines) + if next_urls != (self.urls or ''): + self.urls = next_urls + self.save(update_fields=['urls', 'modified_at']) + return removed_urls + + def prune_url(self, url: str) -> int: + target = (url or '').strip() + removed = self.prune_urls(lambda candidate: candidate == target) + return len(removed) + + def exclude_domain(self, domain: str) -> dict[str, int | str | bool]: + normalized_domain = self.normalize_domain(domain) + if not 
normalized_domain: + return { + 'domain': '', + 'created': False, + 'removed_urls': 0, + 'deleted_snapshots': 0, + } + + domains = self.get_url_denylist(use_effective_config=False) + created = normalized_domain not in domains + if created: + domains.append(normalized_domain) + self.set_url_filters( + self.get_url_allowlist(use_effective_config=False), + domains, + ) + self.save(update_fields=['config', 'modified_at']) + + filter_result = self.apply_crawl_config_filters() + + return { + 'domain': normalized_domain, + 'created': created, + 'removed_urls': filter_result['removed_urls'], + 'deleted_snapshots': filter_result['deleted_snapshots'], + } + def get_system_task(self) -> str | None: urls = self.get_urls_list() if len(urls) != 1: @@ -284,11 +502,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Returns: True if URL was added, False if skipped (duplicate or depth exceeded) """ - import json + from archivebox.misc.util import fix_url_from_markdown - url = entry.get('url', '') + url = fix_url_from_markdown(str(entry.get('url', '') or '').strip()) if not url: return False + if not self.url_passes_filters(url): + return False depth = entry.get('depth', 1) @@ -301,20 +521,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return False # Check if already in urls (parse existing JSONL entries) - existing_urls = set() - for line in self.urls.splitlines(): - if not line.strip(): - continue - try: - existing_entry = json.loads(line) - existing_urls.add(existing_entry.get('url', '')) - except json.JSONDecodeError: - existing_urls.add(line.strip()) + existing_urls = {url for _raw_line, url in self._iter_url_lines() if url} if url in existing_urls: return False # Append as JSONL + entry = {**entry, 'url': url} jsonl_entry = json.dumps(entry) self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n') self.save(update_fields=['urls', 'modified_at']) @@ -327,15 +540,11 @@ class 
Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith Returns: List of newly created Snapshot objects """ - import sys - import json from archivebox.core.models import Snapshot + from archivebox.misc.util import fix_url_from_markdown created_snapshots = [] - print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr) - print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr) - for line in self.urls.splitlines(): if not line.strip(): continue @@ -343,13 +552,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Parse JSONL or plain URL try: entry = json.loads(line) - url = entry.get('url', '') + url = fix_url_from_markdown(str(entry.get('url', '') or '').strip()) depth = entry.get('depth', 0) title = entry.get('title') timestamp = entry.get('timestamp') tags = entry.get('tags', '') except json.JSONDecodeError: - url = line.strip() + url = fix_url_from_markdown(line.strip()) depth = 0 title = None timestamp = None @@ -357,6 +566,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if not url: continue + if not self.url_passes_filters(url): + continue # Skip if depth exceeds max_depth if depth > self.max_depth: diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 586f88fb..3dda2bd6 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -64,6 +64,7 @@ from abx_plugins import get_plugins_dir from django.conf import settings from django.utils.safestring import mark_safe from archivebox.config.constants import CONSTANTS +from archivebox.misc.util import fix_url_from_markdown if TYPE_CHECKING: from archivebox.machine.models import Process @@ -266,7 +267,7 @@ def run_hook( if process.status == 'exited': records = process.get_records() # Get parsed JSONL output """ - from archivebox.machine.models import Process, Machine + from archivebox.machine.models import Process, Machine, 
# Helper nested inside hooks.get_enabled_plugins() in the original patch;
# reconstructed from a newline-collapsed hunk with documentation added.
def normalize_enabled_plugins(value: Any) -> List[str]:
    """Coerce an ENABLED_PLUGINS / ENABLED_EXTRACTORS config value to a list.

    Accepts None, a JSON-array string ('["a","b"]'), a comma-separated string
    ('a, b'), any list/tuple/set, or a bare scalar. Returns stripped,
    non-empty plugin names; empty/blank input yields [].
    """
    if value is None:
        return []
    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return []
        if raw.startswith('['):
            # The value may be a JSON array serialized into a config string.
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, list):
                return [str(plugin).strip() for plugin in parsed if str(plugin).strip()]
        # Fall back to comma-separated parsing (also covers malformed JSON).
        return [plugin.strip() for plugin in raw.split(',') if plugin.strip()]
    if isinstance(value, (list, tuple, set)):
        return [str(plugin).strip() for plugin in value if str(plugin).strip()]
    # Any other scalar becomes a single-element list (or [] if blank).
    return [str(value).strip()] if str(value).strip() else []
'exit_code', 'pid') search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr') - readonly_fields = ('created_at', 'modified_at', 'machine', 'binary', 'iface', 'archiveresult_link') + readonly_fields = ('created_at', 'modified_at', 'machine', 'binary_link', 'iface_link', 'archiveresult_link') fieldsets = ( ('Process Info', { @@ -178,7 +178,7 @@ class ProcessAdmin(BaseModelAdmin): 'classes': ('card', 'wide'), }), ('Execution', { - 'fields': ('binary', 'iface', 'pid', 'exit_code', 'url'), + 'fields': ('binary_link', 'iface_link', 'pid', 'exit_code', 'url'), 'classes': ('card',), }), ('Timing', { @@ -216,6 +216,21 @@ class ProcessAdmin(BaseModelAdmin): process.binary.id, process.binary.name, process.binary.version, ) + @admin.display(description='Binary', ordering='binary__name') + def binary_link(self, process): + return self.binary_info(process) + + @admin.display(description='Network Interface', ordering='iface__id') + def iface_link(self, process): + if not process.iface: + return '-' + return format_html( + '{} {}', + process.iface.id, + str(process.iface.id)[:8], + process.iface.iface or process.iface.ip_public or process.iface.ip_local, + ) + @admin.display(description='ArchiveResult') def archiveresult_link(self, process): if not hasattr(process, 'archiveresult'): diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index fd700f91..441b8cf1 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -49,6 +49,89 @@ BINARY_RECHECK_INTERVAL = 1 * 30 * 60 PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching +LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"}) + + +def _find_existing_binary_for_reference(machine: 'Machine', reference: str) -> 'Binary | None': + reference = str(reference or '').strip() + if not reference: + return 
None + + qs = Binary.objects.filter(machine=machine) + + direct_match = qs.filter(abspath=reference).order_by('-modified_at').first() + if direct_match: + return direct_match + + ref_name = Path(reference).name + if ref_name: + named_match = qs.filter(name=ref_name).order_by('-modified_at').first() + if named_match: + return named_match + + return qs.filter(name=reference).order_by('-modified_at').first() + + +def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]: + env = env or {} + plugin_name = str(plugin_name or '').strip() + hook_path = str(hook_path or '').strip() + plugin_key = plugin_name.upper().replace('-', '_') + keys: list[str] = [] + seen: set[str] = set() + + def add(key: str) -> None: + if key and key not in seen and env.get(key): + seen.add(key) + keys.append(key) + + if plugin_key: + add(f'{plugin_key}_BINARY') + + try: + from archivebox.hooks import discover_plugin_configs + + plugin_schema = discover_plugin_configs().get(plugin_name, {}) + schema_keys = [ + key + for key in (plugin_schema.get('properties') or {}) + if key.endswith('_BINARY') + ] + except Exception: + schema_keys = [] + + schema_keys.sort(key=lambda key: ( + key != f'{plugin_key}_BINARY', + key.endswith('_NODE_BINARY'), + key.endswith('_CHROME_BINARY'), + key, + )) + for key in schema_keys: + add(key) + + if plugin_name.startswith('search_backend_'): + backend_name = plugin_name.removeprefix('search_backend_').upper().replace('-', '_') + configured_engine = str(env.get('SEARCH_BACKEND_ENGINE') or '').strip().upper().replace('-', '_') + if backend_name and backend_name == configured_engine: + add(f'{backend_name}_BINARY') + + hook_suffix = Path(hook_path).suffix.lower() + if hook_suffix == '.js': + if plugin_key: + add(f'{plugin_key}_NODE_BINARY') + add('NODE_BINARY') + + return keys + + +def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]: + if not isinstance(config, dict): + return {} + + sanitized = 
dict(config) + for key in LEGACY_MACHINE_CONFIG_KEYS: + sanitized.pop(key, None) + return sanitized class MachineManager(models.Manager): @@ -89,13 +172,13 @@ class Machine(ModelWithHealthStats): global _CURRENT_MACHINE if _CURRENT_MACHINE: if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL): - return cls._hydrate_config_from_sibling(_CURRENT_MACHINE) + return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE)) _CURRENT_MACHINE = None _CURRENT_MACHINE, _ = cls.objects.update_or_create( guid=get_host_guid(), defaults={'hostname': socket.gethostname(), **get_os_info(), **get_vm_info(), 'stats': get_host_stats()}, ) - return cls._hydrate_config_from_sibling(_CURRENT_MACHINE) + return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE)) @classmethod def _hydrate_config_from_sibling(cls, machine: 'Machine') -> 'Machine': @@ -115,6 +198,15 @@ class Machine(ModelWithHealthStats): machine.save(update_fields=['config', 'modified_at']) return machine + @classmethod + def _sanitize_config(cls, machine: 'Machine') -> 'Machine': + sanitized = _sanitize_machine_config(machine.config) + current = machine.config or {} + if sanitized != current: + machine.config = sanitized + machine.save(update_fields=['config', 'modified_at']) + return machine + def to_json(self) -> dict: """ Convert Machine model instance to a JSON-serializable dict. 
@@ -152,11 +244,10 @@ class Machine(ModelWithHealthStats): Returns: Machine instance or None """ - config_patch = record.get('config') - if isinstance(config_patch, dict) and config_patch: + config_patch = _sanitize_machine_config(record.get('config')) + if config_patch: machine = Machine.current() - if not machine.config: - machine.config = {} + machine.config = _sanitize_machine_config(machine.config) machine.config.update(config_patch) machine.save(update_fields=['config']) return machine @@ -194,13 +285,17 @@ class NetworkInterface(ModelWithHealthStats): unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),) @classmethod - def current(cls) -> 'NetworkInterface': + def current(cls, refresh: bool = False) -> 'NetworkInterface': global _CURRENT_INTERFACE + machine = Machine.current() if _CURRENT_INTERFACE: - if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL): + if ( + not refresh + and _CURRENT_INTERFACE.machine_id == machine.id + and timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL) + ): return _CURRENT_INTERFACE _CURRENT_INTERFACE = None - machine = Machine.current() net_info = get_host_network() _CURRENT_INTERFACE, _ = cls.objects.update_or_create( machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'), @@ -747,14 +842,17 @@ class ProcessManager(models.Manager): Called during migration and when creating new ArchiveResults. 
""" + iface = kwargs.get('iface') or NetworkInterface.current() + # Defaults from ArchiveResult if not provided defaults = { - 'machine': Machine.current(), + 'machine': iface.machine, 'pwd': kwargs.get('pwd') or str(archiveresult.snapshot.output_dir / archiveresult.plugin), 'cmd': kwargs.get('cmd') or [], 'status': 'queued', 'timeout': kwargs.get('timeout', 120), 'env': kwargs.get('env', {}), + 'iface': iface, } defaults.update(kwargs) @@ -971,6 +1069,28 @@ class Process(models.Model): record['timeout'] = self.timeout return record + def hydrate_binary_from_context(self, *, plugin_name: str = '', hook_path: str = '') -> 'Binary | None': + machine = self.machine if self.machine_id else Machine.current() + + references: list[str] = [] + for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env): + value = str(self.env.get(key) or '').strip() + if value and value not in references: + references.append(value) + + if self.cmd: + cmd_0 = str(self.cmd[0]).strip() + if cmd_0 and cmd_0 not in references: + references.append(cmd_0) + + for reference in references: + binary = _find_existing_binary_for_reference(machine, reference) + if binary: + self.binary = binary + return binary + + return None + @classmethod def parse_records_from_text(cls, text: str) -> list[dict]: """Parse JSONL records from raw text using the shared JSONL parser.""" @@ -1044,6 +1164,7 @@ class Process(models.Model): current_pid = os.getpid() machine = Machine.current() + iface = NetworkInterface.current() # Check cache validity if _CURRENT_PROCESS: @@ -1053,6 +1174,9 @@ class Process(models.Model): and _CURRENT_PROCESS.machine_id == machine.id and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL) ): + if _CURRENT_PROCESS.iface_id != iface.id: + _CURRENT_PROCESS.iface = iface + _CURRENT_PROCESS.save(update_fields=['iface', 'modified_at']) _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS _CURRENT_PROCESS = None @@ -1080,6 +1204,9 @@ 
class Process(models.Model): db_start_time = existing.started_at.timestamp() if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: _CURRENT_PROCESS = existing + if existing.iface_id != iface.id: + existing.iface = iface + existing.save(update_fields=['iface', 'modified_at']) _CURRENT_PROCESS.ensure_log_files() return existing @@ -1112,6 +1239,7 @@ class Process(models.Model): pid=current_pid, started_at=started_at, status=cls.StatusChoices.RUNNING, + iface=iface, ) _CURRENT_PROCESS.ensure_log_files() return _CURRENT_PROCESS @@ -1176,7 +1304,9 @@ class Process(models.Model): if 'supervisord' in argv_str: return cls.TypeChoices.SUPERVISORD - elif 'archivebox run' in argv_str or 'runner_watch' in argv_str: + elif 'runner_watch' in argv_str: + return cls.TypeChoices.WORKER + elif 'archivebox run' in argv_str: return cls.TypeChoices.ORCHESTRATOR elif 'archivebox' in argv_str: return cls.TypeChoices.CLI @@ -1321,14 +1451,17 @@ class Process(models.Model): if self.cmd: try: os_cmdline = os_proc.cmdline() - # Check if first arg (binary) matches if os_cmdline and self.cmd: - os_binary = os_cmdline[0] if os_cmdline else '' db_binary = self.cmd[0] if self.cmd else '' - # Match by basename (handles /usr/bin/python3 vs python3) - if os_binary and db_binary: - if Path(os_binary).name != Path(db_binary).name: - return None # Different binary, PID reused + if db_binary: + db_binary_name = Path(db_binary).name + cmd_matches = any( + arg == db_binary or Path(arg).name == db_binary_name + for arg in os_cmdline + if arg + ) + if not cmd_matches: + return None # Different command, PID reused except (psutil.AccessDenied, psutil.ZombieProcess): pass # Can't check cmdline, trust start time match diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index e040b219..5d9a3129 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -4,6 +4,7 @@ import re import requests import json as pyjson import http.cookiejar +from dateparser import parse as dateparser from 
def split_comma_separated_urls(url: str):
    """Yield ``(offset, url)`` pairs for URLs glued together by commas.

    'https://a.com,https://b.com' yields (0, 'https://a.com') then
    (14, 'https://b.com'). A scheme that is NOT directly preceded by a comma
    (e.g. inside a query string like '?next=https://...') does not trigger a
    split. ``offset`` is each piece's position within the original string.
    """
    offset = 0
    while True:
        # Find the next embedded scheme; start at 1 to skip this URL's own scheme.
        http_index = url.find('http://', 1)
        https_index = url.find('https://', 1)
        next_indices = [idx for idx in (http_index, https_index) if idx != -1]
        if not next_indices:
            yield offset, url
            return

        next_index = min(next_indices)
        if url[next_index - 1] != ',':
            # Embedded URL not separated by a comma — treat as one URL.
            yield offset, url
            return

        yield offset, url[:next_index - 1]  # drop the trailing comma
        offset += next_index
        url = url[next_index:]


def find_all_urls(urls_str: str):
    """Yield every normalized URL found in a blob of text.

    Scans with URL_REGEX, repairs markdown-mangled URLs, then splits
    comma-joined URLs into separate results. Later comma-split pieces also
    match URL_REGEX on their own, so their start positions are remembered in
    ``skipped_starts`` to avoid yielding them twice.
    """
    skipped_starts = set()
    for match in re.finditer(URL_REGEX, urls_str):
        if match.start() in skipped_starts:
            continue

        for offset, url in split_comma_separated_urls(fix_url_from_markdown(match.group(1))):
            if offset:
                # NOTE(review): offset is relative to the *fixed* URL; if
                # fix_url_from_markdown changed its length this position may
                # drift from the raw text — confirm against URL_REGEX usage.
                skipped_starts.add(match.start() + offset)
            yield url
{date}') + + try: + return datetime.fromtimestamp(float(normalized), tz=timezone.utc) + except (TypeError, ValueError, OSError): + pass + + try: + iso_date = normalized.replace('Z', '+00:00') + parsed_date = datetime.fromisoformat(iso_date) + if parsed_date.tzinfo is None: + return parsed_date.replace(tzinfo=timezone.utc) + return parsed_date.astimezone(timezone.utc) + except ValueError: + pass + + parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'}) if parsed_date is None: raise ValueError(f'Tried to parse invalid date string! {date}') return parsed_date.astimezone(timezone.utc) @@ -408,6 +452,7 @@ assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguat URL_REGEX_TESTS = [ ('https://example.com', ['https://example.com']), + ('https://sweeting.me,https://google.com', ['https://sweeting.me', 'https://google.com']), ('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']), ('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']), diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py index b97a94f6..501495bf 100644 --- a/archivebox/personas/admin.py +++ b/archivebox/personas/admin.py @@ -1,2 +1,169 @@ +__package__ = "archivebox.personas" -# Register your models here. 
+import shutil + +from django.contrib import admin, messages +from django.utils.html import format_html, format_html_join + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.personas.forms import PersonaAdminForm +from archivebox.personas.importers import discover_local_browser_profiles +from archivebox.personas.models import Persona + + +class PersonaAdmin(ConfigEditorMixin, BaseModelAdmin): + form = PersonaAdminForm + change_form_template = "admin/personas/persona/change_form.html" + + list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state") + search_fields = ("name", "created_by__username") + list_filter = ("created_by",) + ordering = ["name"] + list_per_page = 100 + readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status") + + add_fieldsets = ( + ("Persona", { + "fields": ("name", "created_by"), + "classes": ("card",), + }), + ("Browser Import", { + "fields": ( + "import_mode", + "import_discovered_profile", + "import_source", + "import_profile_name", + "import_copy_profile", + "import_extract_cookies", + "import_capture_storage", + ), + "classes": ("card", "wide"), + }), + ("Advanced", { + "fields": ("config",), + "classes": ("card", "wide"), + }), + ) + + change_fieldsets = add_fieldsets + ( + ("Artifacts", { + "fields": ("persona_paths", "import_artifact_status"), + "classes": ("card", "wide"), + }), + ("Timestamps", { + "fields": ("id", "created_at"), + "classes": ("card",), + }), + ) + + @admin.display(description="Chrome Profile") + def chrome_profile_state(self, obj: Persona) -> str: + return "yes" if (obj.path / "chrome_user_data").exists() else "no" + + @admin.display(description="cookies.txt") + def cookies_state(self, obj: Persona) -> str: + return "yes" if obj.COOKIES_FILE else "no" + + @admin.display(description="auth.json") + def auth_state(self, obj: Persona) -> str: + return "yes" if obj.AUTH_STORAGE_FILE else "no" + + 
@admin.display(description="Persona Paths") + def persona_paths(self, obj: Persona) -> str: + return format_html( + "
" + "
Persona root{}
" + "
chrome_user_data{}
" + "
chrome_extensions{}
" + "
chrome_downloads{}
" + "
cookies.txt{}
" + "
auth.json{}
" + "
", + obj.path, + obj.CHROME_USER_DATA_DIR, + obj.CHROME_EXTENSIONS_DIR, + obj.CHROME_DOWNLOADS_DIR, + obj.COOKIES_FILE or (obj.path / "cookies.txt"), + obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"), + ) + + @admin.display(description="Import Artifacts") + def import_artifact_status(self, obj: Persona) -> str: + entries = [ + ("Browser profile", (obj.path / "chrome_user_data").exists(), obj.CHROME_USER_DATA_DIR), + ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")), + ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")), + ] + return format_html( + "
{}
", + format_html_join( + "", + "
{}{}{}
", + ( + ( + label, + "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no", + "present" if enabled else "missing", + path, + ) + for label, enabled, path in entries + ), + ), + ) + + def get_fieldsets(self, request, obj=None): + return self.change_fieldsets if obj else self.add_fieldsets + + def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None): + context["detected_profile_count"] = len(discover_local_browser_profiles()) + return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) + + def save_model(self, request, obj, form, change): + old_path = None + new_path = None + if change: + previous = Persona.objects.get(pk=obj.pk) + if previous.name != obj.name: + old_path = previous.path + new_path = obj.path + + super().save_model(request, obj, form, change) + + if old_path and new_path and old_path != new_path and old_path.exists(): + if new_path.exists(): + raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}") + shutil.move(str(old_path), str(new_path)) + + obj.ensure_dirs() + + import_result = form.apply_import(obj) + if import_result is None: + return + + completed_actions = [] + if import_result.profile_copied: + completed_actions.append("profile copied") + if import_result.cookies_imported: + completed_actions.append("cookies.txt generated") + if import_result.storage_captured: + completed_actions.append("auth.json captured") + if import_result.user_agent_imported: + completed_actions.append("USER_AGENT copied") + + if completed_actions: + messages.success( + request, + f'Imported {", ".join(completed_actions)} from {import_result.source.display_label}.', + ) + else: + messages.warning( + request, + f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.", + ) + + for warning in import_result.warnings: + messages.warning(request, 
warning) + + +def register_admin(admin_site: admin.AdminSite) -> None: + admin_site.register(Persona, PersonaAdmin) diff --git a/archivebox/personas/export_browser_state.js b/archivebox/personas/export_browser_state.js new file mode 100644 index 00000000..77b394f9 --- /dev/null +++ b/archivebox/personas/export_browser_state.js @@ -0,0 +1,210 @@ +#!/usr/bin/env node +/** + * Export cookies and open-tab storage from a Chromium profile or live CDP URL. + * + * Environment variables: + * ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins + * CHROME_USER_DATA_DIR Local Chromium user-data directory to launch + * CHROME_CDP_URL Existing browser CDP URL to attach to + * COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt + * AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json + * CHROME_BINARY Optional browser binary override + * NODE_MODULES_DIR Optional node_modules path for puppeteer-core + */ + +const fs = require('fs'); +const os = require('os'); +const path = require('path'); + +const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR; +if (!pluginsDir) { + console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required'); + process.exit(1); +} + +const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js')); +baseUtils.ensureNodeModuleResolution(module); + +const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js')); +const puppeteer = require('puppeteer-core'); + +function cookieToNetscape(cookie) { + let domain = cookie.domain; + if (!domain.startsWith('.') && !cookie.hostOnly) { + domain = '.' + domain; + } + + const includeSubdomains = domain.startsWith('.') ? 'TRUE' : 'FALSE'; + const cookiePath = cookie.path || '/'; + const secure = cookie.secure ? 'TRUE' : 'FALSE'; + const expiry = cookie.expires && cookie.expires > 0 ? 
Math.floor(cookie.expires).toString() : '0'; + + return `${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${cookie.name}\t${cookie.value}`; +} + +function writeCookiesFile(cookies, outputPath) { + const lines = [ + '# Netscape HTTP Cookie File', + '# https://curl.se/docs/http-cookies.html', + '# This file was generated by ArchiveBox persona cookie extraction', + '#', + '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', + '', + ]; + + for (const cookie of cookies) { + lines.push(cookieToNetscape(cookie)); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + fs.writeFileSync(outputPath, lines.join('\n') + '\n'); +} + +async function collectStorage(browser) { + const localStorage = {}; + const sessionStorage = {}; + const pages = await browser.pages(); + + for (const page of pages) { + try { + const url = page.url(); + if (!url || url === 'about:blank') continue; + if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue; + + const payload = await page.evaluate(() => ({ + origin: window.location.origin, + localStorage: Object.fromEntries(Object.entries(window.localStorage)), + sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)), + })); + + if (!payload.origin || payload.origin === 'null') continue; + if (Object.keys(payload.localStorage || {}).length > 0) { + localStorage[payload.origin] = payload.localStorage; + } + if (Object.keys(payload.sessionStorage || {}).length > 0) { + sessionStorage[payload.origin] = payload.sessionStorage; + } + } catch (error) { + // Ignore pages that cannot be inspected via evaluate(). 
+ } + } + + return { localStorage, sessionStorage }; +} + +async function openBrowser() { + const cdpUrl = process.env.CHROME_CDP_URL || ''; + if (cdpUrl) { + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null }); + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + }, + sourceDescription: cdpUrl, + }; + } + + const userDataDir = process.env.CHROME_USER_DATA_DIR; + if (!userDataDir) { + throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required'); + } + if (!fs.existsSync(userDataDir)) { + throw new Error(`User data directory does not exist: ${userDataDir}`); + } + + const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-')); + const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary(); + if (!binary) { + throw new Error('Could not find a Chromium binary for browser state export'); + } + + const launched = await chromeUtils.launchChromium({ + binary, + outputDir, + userDataDir, + headless: true, + killZombies: false, + }); + + if (!launched.success) { + throw new Error(launched.error || 'Chrome launch failed'); + } + + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null }); + + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + try { + await chromeUtils.killChrome(launched.pid, outputDir); + } catch (error) {} + try { + fs.rmSync(outputDir, { recursive: true, force: true }); + } catch (error) {} + }, + sourceDescription: userDataDir, + }; +} + +async function main() { + const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || ''; + const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || ''; + if (!cookiesOutput && !authOutput) { + throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required'); + } + + const { browser, cleanup, sourceDescription } = await openBrowser(); + + try { + 
const session = await browser.target().createCDPSession(); + const browserVersion = await session.send('Browser.getVersion'); + const cookieResult = await session.send('Storage.getCookies'); + const cookies = cookieResult?.cookies || []; + const { localStorage, sessionStorage } = await collectStorage(browser); + const userAgent = browserVersion?.userAgent || ''; + + if (cookiesOutput) { + writeCookiesFile(cookies, cookiesOutput); + } + + if (authOutput) { + fs.mkdirSync(path.dirname(authOutput), { recursive: true }); + fs.writeFileSync( + authOutput, + JSON.stringify( + { + TYPE: 'auth', + SOURCE: sourceDescription, + captured_at: new Date().toISOString(), + user_agent: userAgent, + cookies, + localStorage, + sessionStorage, + }, + null, + 2, + ) + '\n', + ); + } + + console.error( + `[+] Exported ${cookies.length} cookies` + + `${authOutput ? ` and ${Object.keys(localStorage).length + Object.keys(sessionStorage).length} storage origins` : ''}` + + `${userAgent ? ' with browser USER_AGENT' : ''}` + + ` from ${sourceDescription}`, + ); + } finally { + await cleanup(); + } +} + +main().catch((error) => { + console.error(`ERROR: ${error.message}`); + process.exit(1); +}); diff --git a/archivebox/personas/forms.py b/archivebox/personas/forms.py new file mode 100644 index 00000000..fbcf8a61 --- /dev/null +++ b/archivebox/personas/forms.py @@ -0,0 +1,176 @@ +__package__ = "archivebox.personas" + +from typing import Any + +from django import forms +from django.utils.safestring import mark_safe + +from archivebox.personas.importers import ( + PersonaImportResult, + PersonaImportSource, + discover_local_browser_profiles, + import_persona_from_source, + resolve_custom_import_source, + validate_persona_name, +) +from archivebox.personas.models import Persona + + +def _mode_label(title: str, description: str) -> str: + return mark_safe( + f'{title}{description}' + ) + + +class PersonaAdminForm(forms.ModelForm): + import_mode = forms.ChoiceField( + required=False, + 
initial="none", + label="Bootstrap this persona", + widget=forms.RadioSelect, + choices=( + ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")), + ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")), + ("custom", _mode_label("Use a custom path or CDP URL", "Paste an absolute Chromium path or attach to a live browser debugging endpoint.")), + ), + help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.", + ) + import_discovered_profile = forms.ChoiceField( + required=False, + label="Autodiscovered profiles", + widget=forms.RadioSelect, + choices=(), + help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.", + ) + import_source = forms.CharField( + required=False, + label="Absolute path or CDP URL", + widget=forms.TextInput( + attrs={ + "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or ws://127.0.0.1:9222/devtools/browser/...", + "style": "width: 100%; font-family: monospace;", + } + ), + help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.", + ) + import_profile_name = forms.CharField( + required=False, + label="Profile directory name", + widget=forms.TextInput( + attrs={ + "placeholder": "Default or Profile 1", + "style": "width: 100%; font-family: monospace;", + } + ), + help_text="Only used when the custom path points at a browser root containing multiple profiles.", + ) + import_copy_profile = forms.BooleanField( + required=False, + initial=True, + label="Copy browser profile into this persona", + help_text="Copies the chosen Chromium user-data tree into `chrome_user_data` for future archiving runs.", + ) + import_extract_cookies = forms.BooleanField( + required=False, + initial=True, + label="Generate `cookies.txt`", + help_text="Extracts cookies through Chrome DevTools Protocol and writes a 
Netscape cookie jar for wget/curl-based plugins.", + ) + import_capture_storage = forms.BooleanField( + required=False, + initial=True, + label="Capture open-tab storage into `auth.json`", + help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.", + ) + + class Meta: + model = Persona + fields = ("name", "created_by", "config") + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.discovered_profiles = discover_local_browser_profiles() + self._resolved_import_source: PersonaImportSource | None = None + + self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode" + self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker" + + if self.discovered_profiles: + self.fields["import_discovered_profile"].choices = [ + (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles + ] + else: + self.fields["import_discovered_profile"].choices = [] + self.fields["import_discovered_profile"].help_text = ( + "No local Chromium profiles were detected on this host right now. " + "Use the custom path/CDP option if the browser data lives elsewhere." 
+ ) + + def clean_name(self) -> str: + name = str(self.cleaned_data.get("name") or "").strip() + is_valid, error_message = validate_persona_name(name) + if not is_valid: + raise forms.ValidationError(error_message) + return name + + def clean(self) -> dict[str, Any]: + cleaned_data = super().clean() + self._resolved_import_source = None + + import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none" + if import_mode == "none": + return cleaned_data + + if import_mode == "discovered": + selection = str(cleaned_data.get("import_discovered_profile") or "").strip() + if not selection: + self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.") + return cleaned_data + try: + self._resolved_import_source = PersonaImportSource.from_choice_value(selection) + except ValueError as err: + self.add_error("import_discovered_profile", str(err)) + return cleaned_data + elif import_mode == "custom": + raw_value = str(cleaned_data.get("import_source") or "").strip() + if not raw_value: + self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.") + return cleaned_data + try: + self._resolved_import_source = resolve_custom_import_source( + raw_value, + profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None, + ) + except ValueError as err: + self.add_error("import_source", str(err)) + return cleaned_data + else: + self.add_error("import_mode", "Choose how this Persona should be bootstrapped.") + return cleaned_data + + copy_profile = bool(cleaned_data.get("import_copy_profile")) + import_cookies = bool(cleaned_data.get("import_extract_cookies")) + capture_storage = bool(cleaned_data.get("import_capture_storage")) + + if self._resolved_import_source.kind == "cdp": + if not (import_cookies or capture_storage): + self.add_error( + "import_extract_cookies", + "CDP imports can only capture cookies and/or open-tab storage. 
Profile copying is not available for a remote browser endpoint.", + ) + elif not (copy_profile or import_cookies or capture_storage): + raise forms.ValidationError("Select at least one import action.") + + return cleaned_data + + def apply_import(self, persona: Persona) -> PersonaImportResult | None: + if not self._resolved_import_source: + return None + + return import_persona_from_source( + persona, + self._resolved_import_source, + copy_profile=bool(self.cleaned_data.get("import_copy_profile")), + import_cookies=bool(self.cleaned_data.get("import_extract_cookies")), + capture_storage=bool(self.cleaned_data.get("import_capture_storage")), + ) diff --git a/archivebox/personas/importers.py b/archivebox/personas/importers.py new file mode 100644 index 00000000..fa0963bd --- /dev/null +++ b/archivebox/personas/importers.py @@ -0,0 +1,845 @@ +""" +Shared persona browser discovery/import helpers. + +These helpers are used by both the CLI and the Django admin so Persona import +behavior stays consistent regardless of where it is triggered from. 
+""" + +from __future__ import annotations + +import json +import os +import platform +import shutil +import subprocess +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING, Optional +from urllib.parse import urlparse + +from django.utils.html import format_html +from django.utils.safestring import SafeString + +if TYPE_CHECKING: + from archivebox.personas.models import Persona + + +BROWSER_LABELS = { + "chrome": "Google Chrome", + "chromium": "Chromium", + "brave": "Brave", + "edge": "Microsoft Edge", + "custom": "Custom Path", + "persona": "Persona Template", +} + +BROWSER_PROFILE_DIR_NAMES = ( + "Default", + "Profile ", + "Guest Profile", +) + +VOLATILE_PROFILE_COPY_PATTERNS = ( + "Cache", + "Code Cache", + "GPUCache", + "ShaderCache", + "Service Worker", + "GCM Store", + "*.log", + "Crashpad", + "BrowserMetrics", + "BrowserMetrics-spare.pma", + "SingletonLock", + "SingletonSocket", + "SingletonCookie", +) + +PERSONA_PROFILE_DIR_CANDIDATES = ( + "chrome_profile", + "chrome_user_data", +) + + +@dataclass(frozen=True) +class PersonaImportSource: + kind: str + browser: str = "custom" + source_name: str | None = None + user_data_dir: Path | None = None + profile_dir: str | None = None + browser_binary: str | None = None + cdp_url: str | None = None + + @property + def browser_label(self) -> str: + return BROWSER_LABELS.get(self.browser, self.browser.title()) + + @property + def profile_path(self) -> Path | None: + if not self.user_data_dir or not self.profile_dir: + return None + return self.user_data_dir / self.profile_dir + + @property + def display_label(self) -> str: + if self.kind == "cdp": + return self.cdp_url or "CDP URL" + profile_suffix = f" / {self.profile_dir}" if self.profile_dir else "" + source_prefix = f": {self.source_name}" if self.source_name else "" + return f"{self.browser_label}{source_prefix}{profile_suffix}" + + @property + def choice_value(self) -> str: + return json.dumps( + 
{ + "kind": self.kind, + "browser": self.browser, + "source_name": self.source_name or "", + "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "", + "profile_dir": self.profile_dir or "", + "browser_binary": self.browser_binary or "", + "cdp_url": self.cdp_url or "", + }, + sort_keys=True, + ) + + def as_choice_label(self) -> SafeString: + path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "") + binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary" + return format_html( + '' + '{}' + '{}' + '{}' + "", + self.display_label, + binary_suffix, + path_str, + ) + + @classmethod + def from_choice_value(cls, value: str) -> "PersonaImportSource": + try: + payload = json.loads(value) + except json.JSONDecodeError as err: + raise ValueError("Invalid discovered profile selection.") from err + + if payload.get("kind") != "browser-profile": + raise ValueError("Invalid discovered profile selection.") + + user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser() + profile_dir = str(payload.get("profile_dir") or "").strip() + browser = str(payload.get("browser") or "custom").strip().lower() or "custom" + source_name = str(payload.get("source_name") or "").strip() or None + browser_binary = str(payload.get("browser_binary") or "").strip() or None + + return resolve_browser_profile_source( + browser=browser, + source_name=source_name, + user_data_dir=user_data_dir, + profile_dir=profile_dir, + browser_binary=browser_binary, + ) + + +@dataclass +class PersonaImportResult: + source: PersonaImportSource + profile_copied: bool = False + cookies_imported: bool = False + storage_captured: bool = False + user_agent_imported: bool = False + warnings: list[str] = field(default_factory=list) + + @property + def did_work(self) -> bool: + return self.profile_copied or self.cookies_imported or self.storage_captured or self.user_agent_imported + + +def get_chrome_user_data_dir() -> 
Optional[Path]: + """Get the default Chrome user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Google" / "Chrome", + home / "Library" / "Application Support" / "Chromium", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "google-chrome", + home / ".config" / "chromium", + home / ".config" / "chrome", + home / "snap" / "chromium" / "common" / "chromium", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Google" / "Chrome" / "User Data", + local_app_data / "Chromium" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_brave_user_data_dir() -> Optional[Path]: + """Get the default Brave user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_edge_user_data_dir() -> Optional[Path]: + """Get the default Edge user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Microsoft Edge", + ] + elif system == "Linux": + candidates = 
[ + home / ".config" / "microsoft-edge", + home / ".config" / "microsoft-edge-beta", + home / ".config" / "microsoft-edge-dev", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Microsoft" / "Edge" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_browser_binary(browser: str) -> Optional[str]: + system = platform.system() + home = Path.home() + browser = browser.lower() + + if system == "Darwin": + candidates = { + "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"], + "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"], + "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"], + "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"], + }.get(browser, []) + elif system == "Linux": + candidates = { + "chrome": ["/usr/bin/google-chrome", "/usr/bin/google-chrome-stable", "/usr/bin/google-chrome-beta", "/usr/bin/google-chrome-unstable"], + "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"], + "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"], + "edge": ["/usr/bin/microsoft-edge", "/usr/bin/microsoft-edge-stable", "/usr/bin/microsoft-edge-beta", "/usr/bin/microsoft-edge-dev"], + }.get(browser, []) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = { + "chrome": [ + str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"), + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", + ], + "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")], + "brave": [ + str(local_app_data / "BraveSoftware" / "Brave-Browser" / 
"Application" / "brave.exe"), + "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + ], + "edge": [ + str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"), + "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe", + "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe", + ], + }.get(browser, []) + else: + candidates = [] + + for candidate in candidates: + if candidate and Path(candidate).exists(): + return candidate + + return None + + +BROWSER_PROFILE_FINDERS = { + "chrome": get_chrome_user_data_dir, + "chromium": get_chrome_user_data_dir, + "brave": get_brave_user_data_dir, + "edge": get_edge_user_data_dir, +} + +CHROMIUM_BROWSERS = tuple(BROWSER_PROFILE_FINDERS.keys()) + + +NETSCAPE_COOKIE_HEADER = [ + "# Netscape HTTP Cookie File", + "# https://curl.se/docs/http-cookies.html", + "# This file was generated by ArchiveBox persona cookie extraction", + "#", + "# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue", + "", +] + + +def validate_persona_name(name: str) -> tuple[bool, str]: + """Validate persona name to prevent path traversal.""" + if not name or not name.strip(): + return False, "Persona name cannot be empty" + if "/" in name or "\\" in name: + return False, "Persona name cannot contain path separators (/ or \\)" + if ".." 
in name: + return False, "Persona name cannot contain parent directory references (..)" + if name.startswith("."): + return False, "Persona name cannot start with a dot (.)" + if "\x00" in name or "\n" in name or "\r" in name: + return False, "Persona name contains invalid characters" + return True, "" + + +def discover_local_browser_profiles() -> list[PersonaImportSource]: + discovered: list[PersonaImportSource] = [] + + for browser, finder in BROWSER_PROFILE_FINDERS.items(): + user_data_dir = finder() + if not user_data_dir: + continue + + browser_binary = get_browser_binary(browser) + for profile_dir in _list_profile_names(user_data_dir): + try: + discovered.append( + resolve_browser_profile_source( + browser=browser, + user_data_dir=user_data_dir, + profile_dir=profile_dir, + browser_binary=browser_binary, + ) + ) + except ValueError: + continue + + discovered.extend(discover_persona_template_profiles()) + + return discovered + + +def discover_persona_template_profiles(personas_dir: Path | None = None) -> list[PersonaImportSource]: + from archivebox.config.constants import CONSTANTS + + templates: list[PersonaImportSource] = [] + candidate_roots: list[Path] = [] + + if personas_dir is not None: + candidate_roots.append(personas_dir.expanduser()) + else: + candidate_roots.extend( + [ + CONSTANTS.PERSONAS_DIR.expanduser(), + Path.home() / ".config" / "abx" / "personas", + ] + ) + + seen_roots: set[Path] = set() + for personas_root in candidate_roots: + resolved_root = personas_root.resolve() + if resolved_root in seen_roots: + continue + seen_roots.add(resolved_root) + + if not resolved_root.exists() or not resolved_root.is_dir(): + continue + + for persona_dir in sorted((path for path in resolved_root.iterdir() if path.is_dir()), key=lambda path: path.name.lower()): + for candidate_dir_name in PERSONA_PROFILE_DIR_CANDIDATES: + user_data_dir = persona_dir / candidate_dir_name + if not user_data_dir.exists() or not user_data_dir.is_dir(): + continue + + for 
profile_dir in _list_profile_names(user_data_dir): + try: + templates.append( + resolve_browser_profile_source( + browser="persona", + source_name=persona_dir.name, + user_data_dir=user_data_dir, + profile_dir=profile_dir, + browser_binary=get_browser_binary("chrome"), + ) + ) + except ValueError: + continue + + return templates + + +def resolve_browser_import_source(browser: str, profile_dir: str | None = None) -> PersonaImportSource: + browser = browser.lower().strip() + if browser not in BROWSER_PROFILE_FINDERS: + supported = ", ".join(BROWSER_PROFILE_FINDERS) + raise ValueError(f"Unknown browser: {browser}. Supported browsers: {supported}") + + user_data_dir = BROWSER_PROFILE_FINDERS[browser]() + if not user_data_dir: + raise ValueError(f"Could not find {browser} profile directory") + + chosen_profile = profile_dir or pick_default_profile_dir(user_data_dir) + if not chosen_profile: + raise ValueError(f"Could not find a profile in {user_data_dir}") + + return resolve_browser_profile_source( + browser=browser, + user_data_dir=user_data_dir, + profile_dir=chosen_profile, + browser_binary=get_browser_binary(browser), + ) + + +def resolve_browser_profile_source( + browser: str, + user_data_dir: Path, + profile_dir: str, + source_name: str | None = None, + browser_binary: str | None = None, +) -> PersonaImportSource: + resolved_root = user_data_dir.expanduser() + if not resolved_root.is_absolute(): + resolved_root = resolved_root.resolve() + if not resolved_root.exists(): + raise ValueError(f"Profile root does not exist: {resolved_root}") + if not profile_dir.strip(): + raise ValueError("Profile directory name cannot be empty.") + + profile_path = resolved_root / profile_dir + if not _looks_like_profile_dir(profile_path): + raise ValueError(f"Profile directory does not look valid: {profile_path}") + + return PersonaImportSource( + kind="browser-profile", + browser=browser, + source_name=source_name, + user_data_dir=resolved_root, + profile_dir=profile_dir, + 
browser_binary=browser_binary, + ) + + +def resolve_custom_import_source(raw_value: str, profile_dir: str | None = None) -> PersonaImportSource: + raw_value = raw_value.strip() + if not raw_value: + raise ValueError("Provide an absolute browser profile path or a CDP URL.") + + if _looks_like_cdp_url(raw_value): + return PersonaImportSource(kind="cdp", cdp_url=raw_value) + + source_path = Path(raw_value).expanduser() + if not source_path.is_absolute(): + raise ValueError("Custom browser path must be an absolute path.") + if not source_path.exists(): + raise ValueError(f"Custom browser path does not exist: {source_path}") + + explicit_profile = profile_dir.strip() if profile_dir else "" + if _looks_like_profile_dir(source_path): + if explicit_profile and explicit_profile != source_path.name: + raise ValueError("Profile name does not match the provided profile directory path.") + return resolve_browser_profile_source( + browser="custom", + user_data_dir=source_path.parent.resolve(), + profile_dir=source_path.name, + ) + + chosen_profile = explicit_profile or pick_default_profile_dir(source_path) + if not chosen_profile: + raise ValueError( + "Could not find a Chromium profile in that directory. " + "Provide an exact profile directory path or fill in the profile name field." 
+ ) + + return resolve_browser_profile_source( + browser="custom", + user_data_dir=source_path.resolve(), + profile_dir=chosen_profile, + ) + + +def pick_default_profile_dir(user_data_dir: Path) -> str | None: + profiles = _list_profile_names(user_data_dir) + if not profiles: + return None + if "Default" in profiles: + return "Default" + return profiles[0] + + +def import_persona_from_source( + persona: "Persona", + source: PersonaImportSource, + *, + copy_profile: bool = True, + import_cookies: bool = True, + capture_storage: bool = False, +) -> PersonaImportResult: + persona.ensure_dirs() + result = PersonaImportResult(source=source) + + persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR) + cookies_file = persona.path / "cookies.txt" + auth_file = persona.path / "auth.json" + + launch_user_data_dir: Path | None = None + + if source.kind == "browser-profile": + if copy_profile and source.user_data_dir: + resolved_source_root = source.user_data_dir.resolve() + resolved_persona_root = persona_chrome_dir.resolve() + if resolved_source_root == resolved_persona_root: + result.warnings.append("Skipped profile copy because the selected source is already this persona's chrome_user_data directory.") + else: + copy_browser_user_data_dir(resolved_source_root, resolved_persona_root) + persona.cleanup_chrome_profile(resolved_persona_root) + result.profile_copied = True + launch_user_data_dir = resolved_persona_root + else: + launch_user_data_dir = source.user_data_dir + elif copy_profile: + result.warnings.append("Profile copying is only available for local Chromium profile paths. 
CDP imports can only pull cookies and open-tab storage.") + + if source.kind == "cdp": + export_success, auth_payload, export_message = export_browser_state( + cdp_url=source.cdp_url, + cookies_output_file=cookies_file if import_cookies else None, + auth_output_file=auth_file if capture_storage else None, + ) + else: + export_success, auth_payload, export_message = export_browser_state( + user_data_dir=launch_user_data_dir, + profile_dir=source.profile_dir, + chrome_binary=source.browser_binary, + cookies_output_file=cookies_file if import_cookies else None, + auth_output_file=auth_file if capture_storage else None, + ) + + if not export_success: + result.warnings.append(export_message or "Browser import failed.") + return result + + if import_cookies and cookies_file.exists(): + result.cookies_imported = True + if capture_storage and auth_file.exists(): + result.storage_captured = True + if _apply_imported_user_agent(persona, auth_payload): + result.user_agent_imported = True + + return result + + +def copy_browser_user_data_dir(source_dir: Path, destination_dir: Path) -> None: + destination_dir.parent.mkdir(parents=True, exist_ok=True) + shutil.rmtree(destination_dir, ignore_errors=True) + shutil.copytree( + source_dir, + destination_dir, + symlinks=True, + ignore=shutil.ignore_patterns(*VOLATILE_PROFILE_COPY_PATTERNS), + ) + + +def export_browser_state( + *, + user_data_dir: Path | None = None, + cdp_url: str | None = None, + profile_dir: str | None = None, + chrome_binary: str | None = None, + cookies_output_file: Path | None = None, + auth_output_file: Path | None = None, +) -> tuple[bool, dict | None, str]: + if not user_data_dir and not cdp_url: + return False, None, "Missing browser source." 
+ + from abx_plugins import get_plugins_dir + from archivebox.config.common import STORAGE_CONFIG + + state_script = Path(__file__).with_name("export_browser_state.js") + if not state_script.exists(): + return False, None, f"Browser state export script not found at {state_script}" + + node_modules_dir = STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" + chrome_plugin_dir = Path(get_plugins_dir()).resolve() + + env = os.environ.copy() + env["NODE_MODULES_DIR"] = str(node_modules_dir) + env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir) + + if user_data_dir: + env["CHROME_USER_DATA_DIR"] = str(user_data_dir) + if cdp_url: + env["CHROME_CDP_URL"] = cdp_url + env["CHROME_IS_LOCAL"] = "false" + if chrome_binary: + env["CHROME_BINARY"] = str(chrome_binary) + if profile_dir: + extra_arg = f"--profile-directory={profile_dir}" + existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip() + args_list: list[str] = [] + if existing_extra: + if existing_extra.startswith("["): + try: + parsed = json.loads(existing_extra) + if isinstance(parsed, list): + args_list.extend(str(x) for x in parsed) + except Exception: + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) + else: + args_list.extend([s.strip() for s in existing_extra.split(",") if s.strip()]) + args_list.append(extra_arg) + env["CHROME_ARGS_EXTRA"] = json.dumps(args_list) + + temp_dir: Path | None = None + tmp_cookies_file: Path | None = None + tmp_auth_file: Path | None = None + + if cookies_output_file and cookies_output_file.exists(): + temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_")) + tmp_cookies_file = temp_dir / "cookies.txt" + env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file) + elif cookies_output_file: + env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file) + + if auth_output_file and auth_output_file.exists(): + temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_")) + tmp_auth_file = temp_dir / "auth.json" + env["AUTH_STORAGE_OUTPUT_FILE"] = 
str(tmp_auth_file) + elif auth_output_file: + env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file) + else: + temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_")) + tmp_auth_file = temp_dir / "auth.json" + env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file) + + try: + result = subprocess.run( + ["node", str(state_script)], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + except subprocess.TimeoutExpired: + return False, None, "Browser state export timed out." + except FileNotFoundError: + return False, None, "Node.js was not found, so ArchiveBox could not extract browser state." + except Exception as err: + return False, None, f"Browser state export failed: {err}" + + if result.returncode != 0: + message = (result.stderr or result.stdout or "").strip() or "Browser state export failed." + return False, None, message + + auth_payload: dict | None = None + if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists(): + _merge_netscape_cookies(cookies_output_file, tmp_cookies_file) + if auth_output_file and tmp_auth_file and tmp_auth_file.exists(): + _merge_auth_storage(auth_output_file, tmp_auth_file) + auth_payload = _load_auth_storage(tmp_auth_file) + elif auth_output_file and auth_output_file.exists(): + auth_payload = _load_auth_storage(auth_output_file) + elif tmp_auth_file and tmp_auth_file.exists(): + auth_payload = _load_auth_storage(tmp_auth_file) + + if temp_dir and temp_dir.exists(): + shutil.rmtree(temp_dir, ignore_errors=True) + + return True, auth_payload, (result.stderr or result.stdout or "").strip() + + +def _list_profile_names(user_data_dir: Path) -> list[str]: + if not user_data_dir.exists() or not user_data_dir.is_dir(): + return [] + + profiles: list[str] = [] + for child in sorted(user_data_dir.iterdir(), key=lambda path: path.name.lower()): + if not child.is_dir(): + continue + if child.name == "System Profile": + continue + if child.name == "Default" or child.name.startswith("Profile ") or 
def _list_profile_names(user_data_dir: Path) -> list[str]:
    """Return Chrome profile directory names inside a user-data dir."""
    if not user_data_dir.exists() or not user_data_dir.is_dir():
        return []

    profiles: list[str] = []
    for child in sorted(user_data_dir.iterdir(), key=lambda path: path.name.lower()):
        if not child.is_dir():
            continue
        if child.name == "System Profile":
            continue
        # NOTE: the original special-cased "Default"/"Profile N"/"Guest Profile"
        # names, but both branches appended under the exact same heuristic, so
        # the check collapses to a single _looks_like_profile_dir() test.
        if _looks_like_profile_dir(child):
            profiles.append(child.name)
    return profiles


def _looks_like_profile_dir(path: Path) -> bool:
    """Heuristic: does this directory contain Chrome profile marker files?"""
    if not path.exists() or not path.is_dir():
        return False

    marker_paths = (
        path / "Preferences",
        path / "History",
        path / "Cookies",
        path / "Network" / "Cookies",
        path / "Local Storage",
        path / "Session Storage",
    )

    if any(marker.exists() for marker in marker_paths):
        return True

    # Fall back to well-known profile dir name prefixes.
    return any(path.name == prefix or path.name.startswith(prefix) for prefix in BROWSER_PROFILE_DIR_NAMES)


def _looks_like_cdp_url(value: str) -> bool:
    """True if value looks like a Chrome DevTools Protocol endpoint URL."""
    parsed = urlparse(value)
    return parsed.scheme in {"ws", "wss", "http", "https"} and bool(parsed.netloc)


def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]:
    """Parse a Netscape-format cookies.txt into {(domain, path, name): fields}.

    BUGFIX: lines prefixed with "#HttpOnly_" (how curl/wget mark HttpOnly
    cookies) are real cookie lines, not comments -- previously they were
    silently dropped, losing HttpOnly session/login cookies on merge. The
    stored domain field keeps the prefix so round-tripping is lossless, while
    the dict key uses the bare domain so merges deduplicate correctly.
    """
    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {}
    if not path.exists():
        return cookies

    httponly_prefix = "#HttpOnly_"
    for line in path.read_text().splitlines():
        is_httponly = line.startswith(httponly_prefix)
        if not line or (line.startswith("#") and not is_httponly):
            continue
        parts = line.split("\t")
        if len(parts) < 7:
            continue
        domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
        key_domain = domain[len(httponly_prefix):] if is_httponly else domain
        cookies[(key_domain, cookie_path, name)] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
    return cookies


def _write_netscape_cookies(
    path: Path,
    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]],
) -> None:
    """Serialize cookies back out in Netscape format under the standard header."""
    lines = list(NETSCAPE_COOKIE_HEADER)
    for cookie in cookies.values():
        lines.append("\t".join(cookie))
    path.write_text("\n".join(lines) + "\n")


def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge new_file into existing_file; new cookies win on (domain, path, name) conflicts."""
    existing = _parse_netscape_cookies(existing_file)
    new = _parse_netscape_cookies(new_file)
    existing.update(new)
    _write_netscape_cookies(existing_file, existing)
def _merge_auth_storage(existing_file: Path, new_file: Path) -> None:
    """Merge the auth payload in new_file into existing_file (newer values win)."""
    base = _load_auth_storage(existing_file)
    incoming = _load_auth_storage(new_file)

    local_store = base.setdefault("localStorage", {})
    session_store = base.setdefault("sessionStorage", {})
    local_store.update(incoming.get("localStorage") or {})
    session_store.update(incoming.get("sessionStorage") or {})

    merged_cookies = _merge_cookie_dicts(base.get("cookies") or [], incoming.get("cookies") or [])

    combined = {
        **base,
        **incoming,
        "cookies": merged_cookies,
        "localStorage": local_store,
        "sessionStorage": session_store,
        # Prefer the freshly-exported user agent, falling back to the old one.
        "user_agent": incoming.get("user_agent") or base.get("user_agent") or "",
    }
    existing_file.write_text(json.dumps(combined, indent=2, sort_keys=True) + "\n")


def _load_auth_storage(path: Path) -> dict:
    """Read an auth-storage JSON payload, falling back to an empty skeleton.

    Always returns a fresh dict (callers mutate the result in place).
    """
    def empty_payload() -> dict:
        return {
            "TYPE": "auth",
            "cookies": [],
            "localStorage": {},
            "sessionStorage": {},
        }

    if not path.exists():
        return empty_payload()
    try:
        loaded = json.loads(path.read_text())
    except json.JSONDecodeError:
        return empty_payload()
    return loaded if isinstance(loaded, dict) else empty_payload()


def _merge_cookie_dicts(existing: list[dict], new: list[dict]) -> list[dict]:
    """Union two cookie lists, deduplicating on (domain, path, name); new wins."""
    def cookie_key(cookie: dict) -> tuple[str, str, str]:
        return (
            str(cookie.get("domain") or ""),
            str(cookie.get("path") or "/"),
            str(cookie.get("name") or ""),
        )

    deduped = {cookie_key(cookie): cookie for cookie in existing}
    deduped.update((cookie_key(cookie), cookie) for cookie in new)
    return list(deduped.values())
@property
def AUTH_STORAGE_FILE(self) -> str:
    """Path to this persona's auth.json if present, otherwise ''."""
    candidate = self.path / 'auth.json'
    if candidate.exists():
        return str(candidate)
    return ''
def _normalize_status(status: str) -> str:
    """Map an event status string onto the ArchiveResult status vocabulary.

    Empty/missing statuses count as failures; the event bus's "noresult"
    spelling is translated to the DB's "noresults".
    """
    if not status:
        return "failed"
    return "noresults" if status == "noresult" else status


def _has_content_files(output_files: list[str]) -> bool:
    """True when at least one output path is real content rather than
    bookkeeping output (log files, pidfiles, hook scripts)."""
    bookkeeping_suffixes = {".log", ".pid", ".sh"}
    for candidate in output_files:
        if Path(candidate).suffix not in bookkeeping_suffixes:
            return True
    return False
list[dict] = [] + for raw_line in stdout.splitlines(): + line = raw_line.strip() + if not line.startswith("{"): + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("type") == "ArchiveResult": + records.append(record) + return records + + class ArchiveResultService(BaseService): - LISTENS_TO = [ArchiveResultEvent] + LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent] EMITS = [] def __init__(self, bus, *, process_service: ProcessService): self.process_service = process_service super().__init__(bus) - async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None: - await sync_to_async(self._project, thread_sensitive=True)(event) + async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None: + snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id) + if snapshot_output_dir is None: + return + plugin_dir = Path(snapshot_output_dir) / event.plugin + output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir) + await run_db_op(self._project, event, output_files, output_size, output_mimetypes) - def _project(self, event: ArchiveResultEvent) -> None: + async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None: + if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"): + return + + plugin_dir = Path(event.output_dir) + output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir) + records = _iter_archiveresult_records(event.stdout) + if records: + for record in records: + await run_db_op( + self._project_from_process_completed, + event, + record, + output_files, + output_size, + output_mimetypes, + ) + return + + synthetic_record = { + "plugin": event.plugin_name, + "hook_name": event.hook_name, + "status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"), + 
"output_str": event.stderr if event.exit_code != 0 else "", + "error": event.stderr if event.exit_code != 0 else "", + } + await run_db_op( + self._project_from_process_completed, + event, + synthetic_record, + output_files, + output_size, + output_mimetypes, + ) + + def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None: + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first() + return str(snapshot.output_dir) if snapshot is not None else None + + def _project( + self, + event: ArchiveResultEvent, + output_files: dict[str, dict], + output_size: int, + output_mimetypes: str, + ) -> None: from archivebox.core.models import ArchiveResult, Snapshot from archivebox.machine.models import Process @@ -86,8 +159,6 @@ class ArchiveResultService(BaseService): }, ) - plugin_dir = Path(snapshot.output_dir) / event.plugin - output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) result.process = process or result.process result.status = _normalize_status(event.status) result.output_str = event.output_str @@ -97,7 +168,28 @@ class ArchiveResultService(BaseService): result.output_mimetypes = output_mimetypes result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now() result.end_ts = parse_event_datetime(event.end_ts) or timezone.now() - result.retry_at = None if event.error: result.notes = event.error result.save() + + def _project_from_process_completed( + self, + event: ProcessCompletedEvent, + record: dict, + output_files: dict[str, dict], + output_size: int, + output_mimetypes: str, + ) -> None: + archive_result_event = ArchiveResultEvent( + snapshot_id=record.get("snapshot_id") or event.snapshot_id, + plugin=record.get("plugin") or event.plugin_name, + hook_name=record.get("hook_name") or event.hook_name, + status=record.get("status") or "", + process_id=event.process_id, + output_str=record.get("output_str") or "", + 
def _resolve_installed_binary_metadata(self, event: BinaryInstalledEvent) -> dict[str, str]:
    """Fill in abspath/version/sha256/provider details for an installed binary.

    Uses the event's own fields when they are already complete; otherwise
    re-resolves the binary via abx_dl as a best-effort fallback (any failure
    leaves the event-provided values untouched -- this never raises).
    """
    resolved = {
        "abspath": event.abspath or "",
        "version": event.version or "",
        "sha256": event.sha256 or "",
        "binproviders": event.binproviders or "",
        "binprovider": event.binprovider or "",
    }
    # Fast path: the event already told us everything we need.
    if all((resolved["abspath"], resolved["version"], resolved["binprovider"])):
        return resolved

    try:
        from abx_dl.dependencies import load_binary

        providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
        binary = load_binary({
            "name": event.name,
            "binproviders": providers,
            "overrides": event.overrides or {},
        })
        resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
        resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
        resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
        provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
        if provider_name:
            resolved["binprovider"] = str(provider_name)
    except Exception:
        # Best-effort enrichment only -- keep whatever the event provided.
        pass

    return resolved
"status", "retry_at", "modified_at"]) + binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"]) diff --git a/archivebox/services/crawl_service.py b/archivebox/services/crawl_service.py index 5add6c2a..1b5e314b 100644 --- a/archivebox/services/crawl_service.py +++ b/archivebox/services/crawl_service.py @@ -1,11 +1,10 @@ from __future__ import annotations -from asgiref.sync import sync_to_async -from django.utils import timezone - from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent from abx_dl.services.base import BaseService +from .db import run_db_op + class CrawlService(BaseService): LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent] @@ -15,17 +14,17 @@ class CrawlService(BaseService): self.crawl_id = crawl_id super().__init__(bus) - async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None: - await sync_to_async(self._mark_started, thread_sensitive=True)() + async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None: + await run_db_op(self._mark_started) - async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None: - await sync_to_async(self._mark_started, thread_sensitive=True)() + async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None: + await run_db_op(self._mark_started) - async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None: - await sync_to_async(self._mark_started, thread_sensitive=True)() + async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None: + await run_db_op(self._mark_started) - async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None: - await sync_to_async(self._mark_completed, thread_sensitive=True)() + async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None: + await run_db_op(self._mark_completed) def _mark_started(self) -> None: from 
def _run_db_op(func, *args, **kwargs):
    """Run *func* synchronously with Django DB connection hygiene.

    close_old_connections() is called before AND after the operation so that
    connections past their max age (or broken ones) are discarded rather than
    reused by long-lived service worker threads.
    """
    close_old_connections()
    try:
        return func(*args, **kwargs)
    finally:
        # Recycle again afterwards so this thread never leaves a stale
        # connection behind for the next caller.
        close_old_connections()


async def run_db_op(func, *args, **kwargs):
    """Await a synchronous DB operation from async service code.

    thread_sensitive=True funnels all DB work onto Django's single
    thread-sensitive executor thread, keeping ORM access serialized.
    """
    return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
TYPE_CHECKING -from asgiref.sync import sync_to_async from django.utils import timezone from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent from abx_dl.services.base import BaseService +from .db import run_db_op + if TYPE_CHECKING: from archivebox.machine.models import Process @@ -33,27 +34,33 @@ class ProcessService(BaseService): self.process_ids: dict[str, str] = {} super().__init__(bus) - async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None: - await sync_to_async(self._project_started, thread_sensitive=True)(event) + async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None: + await run_db_op(self._project_started, event) - async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None: - await sync_to_async(self._project_completed, thread_sensitive=True)(event) + async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None: + await run_db_op(self._project_completed, event) def get_db_process_id(self, process_id: str) -> str | None: return self.process_ids.get(process_id) def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process": - from archivebox.machine.models import Machine, Process + from archivebox.machine.models import NetworkInterface, Process db_process_id = self.process_ids.get(event.process_id) + iface = NetworkInterface.current(refresh=True) if db_process_id: process = Process.objects.filter(id=db_process_id).first() if process is not None: + if process.iface_id != iface.id or process.machine_id != iface.machine_id: + process.iface = iface + process.machine = iface.machine + process.save(update_fields=["iface", "machine", "modified_at"]) return process process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK process = Process.objects.create( - machine=Machine.current(), + machine=iface.machine, + iface=iface, process_type=process_type, 
pwd=event.output_dir, cmd=[event.hook_path, *event.hook_args], @@ -77,12 +84,14 @@ class ProcessService(BaseService): process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now() process.status = process.StatusChoices.RUNNING process.retry_at = None + process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path) process.save() def _project_completed(self, event: ProcessCompletedEvent) -> None: process = self._get_or_create_process(event) process.pwd = event.output_dir - process.cmd = [event.hook_path, *event.hook_args] + if not process.cmd: + process.cmd = [event.hook_path, *event.hook_args] process.env = event.env process.pid = event.pid or process.pid process.started_at = parse_event_datetime(event.start_ts) or process.started_at @@ -92,4 +101,5 @@ class ProcessService(BaseService): process.exit_code = event.exit_code process.status = process.StatusChoices.EXITED process.retry_at = None + process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path) process.save() diff --git a/archivebox/services/runner.py b/archivebox/services/runner.py index 283dfb21..9821ef3a 100644 --- a/archivebox/services/runner.py +++ b/archivebox/services/runner.py @@ -3,16 +3,21 @@ from __future__ import annotations import asyncio import json import os +import shutil +import subprocess import sys import time +from contextlib import nullcontext from pathlib import Path +from tempfile import TemporaryDirectory from typing import Any from django.utils import timezone +from rich.console import Console from abx_dl.events import BinaryEvent -from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins -from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services +from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins +from abx_dl.orchestrator import 
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
    """Count crawl + snapshot hooks across the selected (or all) plugins.

    Used to size the live UI's progress display.
    """
    selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
    total = 0
    for plugin in selected.values():
        total += len(list(plugin.get_crawl_hooks()))
        total += len(list(plugin.get_snapshot_hooks()))
    return total


def _runner_debug(message: str) -> None:
    """Emit a runner-scoped debug line to stderr, flushed immediately."""
    print(f"[runner] {message}", file=sys.stderr, flush=True)


def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
    """Spawn a detached `archivebox run --daemon` orchestrator if none is running.

    Returns True when a new background runner was launched, False when one is
    already RUNNING on this machine or when running under pytest (unless
    allow_under_pytest is set).
    """
    if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest:
        return False

    from archivebox.config import CONSTANTS
    from archivebox.machine.models import Machine, Process

    # Clear stale RUNNING rows first (presumably dead PIDs -- see
    # Process.cleanup_stale_running) so a crashed orchestrator doesn't
    # permanently block new launches.
    Process.cleanup_stale_running()
    machine = Machine.current()
    if Process.objects.filter(
        machine=machine,
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    ).exists():
        return False

    # Daemon output is appended to the shared errors.log.
    log_path = CONSTANTS.LOGS_DIR / "errors.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))

    # start_new_session detaches the daemon from this process group so it
    # survives the parent exiting.
    with log_path.open("a", encoding="utf-8") as log_handle:
        subprocess.Popen(
            [sys.executable, "-m", "archivebox", "run", "--daemon"],
            cwd=str(CONSTANTS.DATA_DIR),
            env=env,
            stdin=subprocess.DEVNULL,
            stdout=log_handle,
            stderr=log_handle,
            start_new_session=True,
        )
    return True
TagService(bus) + CrawlService(bus, crawl_id=str(self.crawl.id)) + SnapshotService( + bus, + crawl_id=str(self.crawl.id), + schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued, + ) + ArchiveResultService(bus, process_service=process_service) + abx_services = setup_abx_services( + bus, + plugins=self.plugins, + config_overrides=config_overrides, + auto_install=True, + emit_jsonl=False, + ) + return bus, abx_services async def run(self) -> None: from asgiref.sync import sync_to_async @@ -107,35 +195,63 @@ class CrawlRunner: try: await sync_to_async(self._prepare, thread_sensitive=True)() - _attach_bus_trace(self.bus) - self.abx_services = setup_abx_services( - self.bus, - plugins=self.plugins, - config_overrides=self.base_config, - auto_install=True, - emit_jsonl=False, - ) - if self.crawl.get_system_task() == INSTALL_URL: - await self._run_install_crawl() - else: - snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)() - if snapshot_ids: - root_snapshot_id = snapshot_ids[0] - await self._run_crawl_setup(root_snapshot_id) - for snapshot_id in snapshot_ids: - await self.enqueue_snapshot(snapshot_id) - await self._wait_for_snapshot_tasks() - await self._run_crawl_cleanup(root_snapshot_id) - if self.abx_services is not None: - await self.abx_services.process.wait_for_background_monitors() + live_ui = self._create_live_ui() + with live_ui if live_ui is not None else nullcontext(): + _attach_bus_trace(self.bus) + self.abx_services = setup_abx_services( + self.bus, + plugins=self.plugins, + config_overrides=self.base_config, + auto_install=True, + emit_jsonl=False, + ) + if self.crawl.get_system_task() == INSTALL_URL: + await self._run_install_crawl() + else: + snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)() + if snapshot_ids: + root_snapshot_id = snapshot_ids[0] + _runner_debug(f"crawl {self.crawl.id} starting crawl setup 
root_snapshot={root_snapshot_id}") + await self._run_crawl_setup(root_snapshot_id) + _runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}") + for snapshot_id in snapshot_ids: + await self.enqueue_snapshot(snapshot_id) + _runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}") + await self._wait_for_snapshot_tasks() + _runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks") + _runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()") + await sync_to_async(self.crawl.cleanup, thread_sensitive=True)() + _runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()") + _runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}") + await self._run_crawl_cleanup(root_snapshot_id) + _runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}") + if self.abx_services is not None: + _runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors") + await self.abx_services.process.wait_for_background_monitors() + _runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors") finally: await _stop_bus_trace(self.bus) await self.bus.stop() + if self._live_stream is not None: + try: + self._live_stream.close() + except Exception: + pass + self._live_stream = None await sync_to_async(self._cleanup_persona, thread_sensitive=True)() crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id) - if crawl.status != Crawl.StatusChoices.SEALED: - crawl.status = Crawl.StatusChoices.SEALED - crawl.retry_at = None + crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)() + if crawl_is_finished: + if crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", 
"retry_at", "modified_at"]) + else: + if crawl.status == Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.QUEUED + elif crawl.status != Crawl.StatusChoices.STARTED: + crawl.status = Crawl.StatusChoices.STARTED + crawl.retry_at = crawl.retry_at or timezone.now() await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"]) async def enqueue_snapshot(self, snapshot_id: str) -> None: @@ -145,17 +261,36 @@ class CrawlRunner: task = asyncio.create_task(self._run_snapshot(snapshot_id)) self.snapshot_tasks[snapshot_id] = task + async def leave_snapshot_queued(self, snapshot_id: str) -> None: + return None + async def _wait_for_snapshot_tasks(self) -> None: while True: - active = [task for task in self.snapshot_tasks.values() if not task.done()] - if not active: + pending_tasks: list[asyncio.Task[None]] = [] + for snapshot_id, task in list(self.snapshot_tasks.items()): + if task.done(): + if self.snapshot_tasks.get(snapshot_id) is task: + self.snapshot_tasks.pop(snapshot_id, None) + task.result() + continue + pending_tasks.append(task) + if not pending_tasks: return - await asyncio.gather(*active) + done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + task.result() def _prepare(self) -> None: from archivebox.config.configset import get_config + from archivebox.machine.models import NetworkInterface, Process self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else "" + current_iface = NetworkInterface.current(refresh=True) + current_process = Process.current() + if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id: + current_process.iface = current_iface + current_process.machine = current_iface.machine + current_process.save(update_fields=["iface", "machine", "modified_at"]) self.persona = self.crawl.resolve_persona() self.base_config = get_config(crawl=self.crawl) if 
self.selected_plugins is None: @@ -168,6 +303,52 @@ class CrawlRunner: if self.persona: self.persona.cleanup_runtime_for_crawl(self.crawl) + def _create_live_ui(self) -> LiveBusUI | None: + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + if not interactive_tty: + return None + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8") + stream = self._live_stream + except OSError: + self._live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": str(terminal_width), + "LINES": str(terminal_height), + }, + ) + plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)" + live_ui = LiveBusUI( + self.bus, + total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins), + timeout_seconds=int(self.base_config.get("TIMEOUT") or 60), + ui_console=ui_console, + interactive_tty=True, + ) + live_ui.print_intro( + url=self.primary_url or INSTALL_URL, + output_dir=Path(self.crawl.output_dir), + plugins_label=plugins_label, + ) + return live_ui + def _create_root_snapshots(self) -> list[str]: created = self.crawl.create_snapshots_from_urls() snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at")) @@ -290,18 +471,34 @@ class CrawlRunner: parent_snapshot_id=snapshot["parent_snapshot_id"], crawl_id=str(self.crawl.id), ) - await 
download( - url=snapshot["url"], - plugins=self.plugins, - output_dir=Path(snapshot["output_dir"]), - selected_plugins=self.selected_plugins, + snapshot_bus, snapshot_services = self._create_projector_bus( + identifier=f"{self.crawl.id}_{snapshot['id']}", config_overrides=snapshot["config"], - bus=self.bus, - emit_jsonl=False, - snapshot=abx_snapshot, - skip_crawl_setup=True, - skip_crawl_cleanup=True, ) + try: + _attach_bus_trace(snapshot_bus) + _runner_debug(f"snapshot {snapshot_id} starting download()") + await download( + url=snapshot["url"], + plugins=self.plugins, + output_dir=Path(snapshot["output_dir"]), + selected_plugins=self.selected_plugins, + config_overrides=snapshot["config"], + bus=snapshot_bus, + emit_jsonl=False, + snapshot=abx_snapshot, + skip_crawl_setup=True, + skip_crawl_cleanup=True, + ) + _runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors") + await snapshot_services.process.wait_for_background_monitors() + _runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors") + finally: + current_task = asyncio.current_task() + if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task: + self.snapshot_tasks.pop(snapshot_id, None) + await _stop_bus_trace(snapshot_bus) + await snapshot_bus.stop() def _load_snapshot_run_data(self, snapshot_id: str): from archivebox.core.models import Snapshot @@ -322,11 +519,24 @@ class CrawlRunner: } -def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None: +def run_crawl( + crawl_id: str, + *, + snapshot_ids: list[str] | None = None, + selected_plugins: list[str] | None = None, + process_discovered_snapshots_inline: bool = True, +) -> None: from archivebox.crawls.models import Crawl crawl = Crawl.objects.get(id=crawl_id) - asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run()) + asyncio.run( + CrawlRunner( + crawl, + 
snapshot_ids=snapshot_ids, + selected_plugins=selected_plugins, + process_discovered_snapshots_inline=process_discovered_snapshots_inline, + ).run() + ) async def _run_binary(binary_id: str) -> None: @@ -397,28 +607,203 @@ async def _run_install(plugin_names: list[str] | None = None) -> None: BinaryService(bus) TagService(bus) ArchiveResultService(bus, process_service=process_service) + live_stream = None try: - _attach_bus_trace(bus) - await abx_install_plugins( - plugin_names=plugin_names, - plugins=plugins, - config_overrides=config, - emit_jsonl=False, - bus=bus, - ) - await abx_services.process.wait_for_background_monitors() + selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names) + plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)" + timeout_seconds = int(config.get("TIMEOUT") or 60) + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + ui_console = None + live_ui = None + + if interactive_tty: + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8") + stream = live_stream + except OSError: + live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": str(terminal_width), + "LINES": str(terminal_height), + }, + ) + + with TemporaryDirectory(prefix="archivebox-install-") as temp_dir: + output_dir = Path(temp_dir) + if ui_console is not None: + live_ui = 
LiveBusUI( + bus, + total_hooks=_count_selected_hooks(selected_plugins, None), + timeout_seconds=timeout_seconds, + ui_console=ui_console, + interactive_tty=interactive_tty, + ) + live_ui.print_intro( + url=INSTALL_URL, + output_dir=output_dir, + plugins_label=plugins_label, + ) + with live_ui if live_ui is not None else nullcontext(): + _attach_bus_trace(bus) + results = await abx_install_plugins( + plugin_names=plugin_names, + plugins=plugins, + output_dir=output_dir, + config_overrides=config, + emit_jsonl=False, + bus=bus, + ) + await abx_services.process.wait_for_background_monitors() + if live_ui is not None: + live_ui.print_summary(results, output_dir=output_dir) finally: await _stop_bus_trace(bus) await bus.stop() + try: + if live_stream is not None: + live_stream.close() + except Exception: + pass def run_install(*, plugin_names: list[str] | None = None) -> None: asyncio.run(_run_install(plugin_names=plugin_names)) +def recover_orphaned_crawls() -> int: + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.machine.models import Process + + active_crawl_ids: set[str] = set() + running_processes = Process.objects.filter( + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.WORKER, + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ).only("env") + + for proc in running_processes: + env = proc.env or {} + if not isinstance(env, dict): + continue + crawl_id = env.get("CRAWL_ID") + if crawl_id: + active_crawl_ids.add(str(crawl_id)) + + recovered = 0 + now = timezone.now() + orphaned_crawls = Crawl.objects.filter( + status=Crawl.StatusChoices.STARTED, + retry_at__isnull=True, + ).prefetch_related("snapshot_set") + + for crawl in orphaned_crawls: + if str(crawl.id) in active_crawl_ids: + continue + + snapshots = list(crawl.snapshot_set.all()) + if not snapshots or all(snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in snapshots): + crawl.status = 
Crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + continue + + crawl.retry_at = now + crawl.save(update_fields=["retry_at", "modified_at"]) + recovered += 1 + + return recovered + + +def recover_orphaned_snapshots() -> int: + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Process + + active_snapshot_ids: set[str] = set() + running_processes = Process.objects.filter( + status=Process.StatusChoices.RUNNING, + process_type__in=[ + Process.TypeChoices.WORKER, + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ).only("env") + + for proc in running_processes: + env = proc.env or {} + if not isinstance(env, dict): + continue + snapshot_id = env.get("SNAPSHOT_ID") + if snapshot_id: + active_snapshot_ids.add(str(snapshot_id)) + + recovered = 0 + now = timezone.now() + orphaned_snapshots = ( + Snapshot.objects + .filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True) + .select_related("crawl") + .prefetch_related("archiveresult_set") + ) + + for snapshot in orphaned_snapshots: + if str(snapshot.id) in active_snapshot_ids: + continue + + results = list(snapshot.archiveresult_set.all()) + if results and all(result.status in ArchiveResult.FINAL_STATES for result in results): + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None + snapshot.downloaded_at = snapshot.downloaded_at or now + snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"]) + + crawl = snapshot.crawl + if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED: + crawl.status = Crawl.StatusChoices.SEALED + crawl.retry_at = None + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + continue + + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = now + snapshot.save(update_fields=["status", 
"retry_at", "modified_at"]) + + crawl = snapshot.crawl + crawl.status = Crawl.StatusChoices.QUEUED + crawl.retry_at = now + crawl.save(update_fields=["status", "retry_at", "modified_at"]) + recovered += 1 + + return recovered + + def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int: from archivebox.crawls.models import Crawl, CrawlSchedule + from archivebox.core.models import Snapshot from archivebox.machine.models import Binary while True: @@ -436,10 +821,48 @@ def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> .first() ) if binary is not None: + if not binary.claim_processing_lock(lock_seconds=60): + continue run_binary(str(binary.id)) continue - pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED) + queued_crawls = Crawl.objects.filter( + retry_at__lte=timezone.now(), + status=Crawl.StatusChoices.QUEUED, + ) + if crawl_id: + queued_crawls = queued_crawls.filter(id=crawl_id) + queued_crawls = queued_crawls.order_by("retry_at", "created_at") + + queued_crawl = queued_crawls.first() + if queued_crawl is not None: + if not queued_crawl.claim_processing_lock(lock_seconds=60): + continue + run_crawl(str(queued_crawl.id), process_discovered_snapshots_inline=False) + continue + + if crawl_id is None: + snapshot = ( + Snapshot.objects.filter(retry_at__lte=timezone.now()) + .exclude(status=Snapshot.StatusChoices.SEALED) + .select_related("crawl") + .order_by("retry_at", "created_at") + .first() + ) + if snapshot is not None: + if not snapshot.claim_processing_lock(lock_seconds=60): + continue + run_crawl( + str(snapshot.crawl_id), + snapshot_ids=[str(snapshot.id)], + process_discovered_snapshots_inline=False, + ) + continue + + pending = Crawl.objects.filter( + retry_at__lte=timezone.now(), + status=Crawl.StatusChoices.STARTED, + ) if crawl_id: pending = pending.filter(id=crawl_id) pending = pending.order_by("retry_at", "created_at") @@ -451,4 +874,7 @@ def 
run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> continue return 0 - run_crawl(str(crawl.id)) + if not crawl.claim_processing_lock(lock_seconds=60): + continue + + run_crawl(str(crawl.id), process_discovered_snapshots_inline=False) diff --git a/archivebox/services/snapshot_service.py b/archivebox/services/snapshot_service.py index bdb35641..c4acbe5d 100644 --- a/archivebox/services/snapshot_service.py +++ b/archivebox/services/snapshot_service.py @@ -1,13 +1,13 @@ from __future__ import annotations -import re - from asgiref.sync import sync_to_async from django.utils import timezone from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent from abx_dl.services.base import BaseService +from .db import run_db_op + class SnapshotService(BaseService): LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent] @@ -18,13 +18,17 @@ class SnapshotService(BaseService): self.schedule_snapshot = schedule_snapshot super().__init__(bus) - async def on_SnapshotEvent(self, event: SnapshotEvent) -> None: - snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event) + async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None: + snapshot_id = await run_db_op(self._project_snapshot, event) + if snapshot_id: + await sync_to_async(self._ensure_crawl_symlink)(snapshot_id) if snapshot_id and event.depth > 0: await self.schedule_snapshot(snapshot_id) - async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None: - await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id) + async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None: + snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id) + if snapshot_id: + await sync_to_async(self._write_snapshot_details)(snapshot_id) def _project_snapshot(self, event: SnapshotEvent) -> str | None: from archivebox.core.models import Snapshot @@ -39,7 +43,6 @@ class SnapshotService(BaseService): 
snapshot.status = Snapshot.StatusChoices.STARTED snapshot.retry_at = None snapshot.save(update_fields=["status", "retry_at", "modified_at"]) - snapshot.ensure_crawl_symlink() return str(snapshot.id) if event.depth > crawl.max_depth: @@ -73,56 +76,36 @@ class SnapshotService(BaseService): if snapshot.status != Snapshot.StatusChoices.SEALED: snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.save(update_fields=["status", "retry_at", "modified_at"]) - snapshot.ensure_crawl_symlink() return str(snapshot.id) def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool: - from archivebox.config.configset import get_config + return crawl.url_passes_filters(url, snapshot=parent_snapshot) - config = get_config( - user=getattr(crawl, "created_by", None), - crawl=crawl, - snapshot=parent_snapshot, - ) - - def to_pattern_list(value): - if isinstance(value, list): - return value - if isinstance(value, str): - return [pattern.strip() for pattern in value.split(",") if pattern.strip()] - return [] - - allowlist = to_pattern_list(config.get("URL_ALLOWLIST", "")) - denylist = to_pattern_list(config.get("URL_DENYLIST", "")) - - for pattern in denylist: - try: - if re.search(pattern, url): - return False - except re.error: - continue - - if allowlist: - for pattern in allowlist: - try: - if re.search(pattern, url): - return True - except re.error: - continue - return False - - return True - - def _seal_snapshot(self, snapshot_id: str) -> None: + def _seal_snapshot(self, snapshot_id: str) -> str | None: from archivebox.core.models import Snapshot snapshot = Snapshot.objects.filter(id=snapshot_id).first() if snapshot is None: - return + return None snapshot.status = Snapshot.StatusChoices.SEALED snapshot.retry_at = None snapshot.downloaded_at = snapshot.downloaded_at or timezone.now() snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"]) + return str(snapshot.id) + + def _ensure_crawl_symlink(self, snapshot_id: str) -> None: + from 
archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first() + if snapshot is not None: + snapshot.ensure_crawl_symlink() + + def _write_snapshot_details(self, snapshot_id: str) -> None: + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first() + if snapshot is None: + return snapshot.write_index_jsonl() snapshot.write_json_details() snapshot.write_html_details() diff --git a/archivebox/services/tag_service.py b/archivebox/services/tag_service.py index 69d0fe2f..78622609 100644 --- a/archivebox/services/tag_service.py +++ b/archivebox/services/tag_service.py @@ -1,16 +1,17 @@ from __future__ import annotations -from asgiref.sync import sync_to_async from abx_dl.events import TagEvent from abx_dl.services.base import BaseService +from .db import run_db_op + class TagService(BaseService): LISTENS_TO = [TagEvent] EMITS = [] - async def on_TagEvent(self, event: TagEvent) -> None: - await sync_to_async(self._project, thread_sensitive=True)(event) + async def on_TagEvent__Outer(self, event: TagEvent) -> None: + await run_db_op(self._project, event) def _project(self, event: TagEvent) -> None: from archivebox.core.models import Snapshot, Tag diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index b2b5bcc9..f9d42c66 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -1083,8 +1083,11 @@ width: 100% !important; } - body.filters-collapsed.change-list #changelist .changelist-form-container > div { + body.filters-collapsed.change-list #changelist .changelist-form-container > div, + body.filters-collapsed.change-list #changelist .changelist-form-container > form { max-width: 100% !important; + width: 100% !important; + flex: 1 1 100% !important; } /* Actions bar */ @@ -1372,7 +1375,8 @@ order: 2; align-self: 
flex-start; } - body.change-list #changelist .changelist-form-container > div { + body.change-list #changelist .changelist-form-container > div, + body.change-list #changelist .changelist-form-container > form { flex: 1 1 auto; min-width: 0; order: 1; diff --git a/archivebox/templates/admin/core/tag/change_form.html b/archivebox/templates/admin/core/tag/change_form.html new file mode 100644 index 00000000..cde49905 --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_form.html @@ -0,0 +1,268 @@ +{% extends "admin/change_form.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
+
+

{% if add %}New Tag{% else %}Edit Tag{% endif %}

+

Similar tags are shown below while typing.

+
+
+
+ Matches + Current tags +
+
+ Links + Open filtered snapshots +
+
+
+{{ block.super }} +{% endblock %} + +{% block after_field_sets %} +{{ block.super }} +
+

Similar Tags

+

Updates while typing.

+
+
+ +{{ tag_similar_cards|json_script:"abx-tag-similar-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_list.html b/archivebox/templates/admin/core/tag/change_list.html new file mode 100644 index 00000000..5ce822c5 --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_list.html @@ -0,0 +1,997 @@ +{% extends "admin/change_list.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %} + +{% block object-tools %}{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block content %} +
+
+
+ + +
+
+ + + +
+
+ +
+
+ {% csrf_token %} +
+ + +
+
+
+
+ +
+
+ {% if initial_tag_cards %} + {% for card in initial_tag_cards %} +
+
+
+ +
+ + + +
+
+
+ + + + + {{ card.num_snapshots }} +
+
+
+ {% if card.snapshots %} + {% for snapshot in card.snapshots %} + + + {{ snapshot.title }} + + {% endfor %} + {% else %} +
No snapshots attached yet.
+ {% endif %} +
+
+ {% endfor %} + {% else %} +
No tags.
+ {% endif %} +
+
+
+ +{{ initial_tag_cards|json_script:"abx-tag-cards-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/personas/persona/change_form.html b/archivebox/templates/admin/personas/persona/change_form.html new file mode 100644 index 00000000..262c66c9 --- /dev/null +++ b/archivebox/templates/admin/personas/persona/change_form.html @@ -0,0 +1,249 @@ +{% extends "admin/change_form.html" %} + +{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block extrahead %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
+
+

Bootstrap a persona from a real browser session

+

+ Pick a local Chromium profile, paste an absolute profile path, or attach to a live CDP endpoint. + The form saves the Persona normally, then imports profile files, cookies, and optional tab storage into + the Persona's own directories. +

+
+
+
+ Detected profiles + {{ detected_profile_count }} +
+
+ Persona artifacts + chrome_user_data + cookies.txt + auth.json +
+
+
+{{ block.super }} +{% endblock %} diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index cd676de9..f5e48789 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -706,14 +706,14 @@ ? Math.max(0, Math.min(100, extractor.progress)) : null; const progressStyle = progress !== null ? ` style="width: ${progress}%;"` : ''; - const pidHtml = extractor.pid ? `pid ${extractor.pid}` : ''; + const pidHtml = extractor.status === 'started' && extractor.pid ? `pid ${extractor.pid}` : ''; return ` ${icon} - ${extractor.plugin || 'unknown'} + ${extractor.label || extractor.plugin || 'unknown'} ${pidHtml} @@ -742,6 +742,23 @@ `; } + const hasProcessEntries = (snapshot.all_plugins || []).some(extractor => extractor.source === 'process'); + const hasArchiveResults = (snapshot.all_plugins || []).some(extractor => extractor.source === 'archiveresult'); + const processOnly = hasProcessEntries && !hasArchiveResults; + const runningProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'started').length; + const failedProcessCount = (snapshot.all_plugins || []).filter(extractor => extractor.source === 'process' && extractor.status === 'failed').length; + const snapshotMeta = (snapshot.total_plugins || 0) > 0 + ? processOnly + ? runningProcessCount > 0 + ? `Running ${runningProcessCount}/${snapshot.total_plugins || 0} setup hooks` + : failedProcessCount > 0 + ? `${failedProcessCount} setup hook${failedProcessCount === 1 ? '' : 's'} failed` + : `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} setup hooks` + : hasProcessEntries + ? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} tasks${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed)` : ''}${runningProcessCount > 0 ? 
` (${runningProcessCount} hooks running)` : ''}` + : `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed)` : ''}` + : 'Waiting for extractors...'; + return `
@@ -750,9 +767,7 @@
${formatUrl(snapshot.url)}
- ${(snapshot.total_plugins || 0) > 0 - ? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed)` : ''}` - : 'Waiting for extractors...'} + ${snapshotMeta}
${snapshotPidHtml} @@ -762,7 +777,7 @@
-
@@ -784,6 +799,29 @@ if (crawl.active_snapshots && crawl.active_snapshots.length > 0) { snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join(''); } + let setupHtml = ''; + if (crawl.setup_plugins && crawl.setup_plugins.length > 0) { + const setupSummary = `${crawl.setup_completed_plugins || 0}/${crawl.setup_total_plugins || 0} setup tasks${(crawl.setup_failed_plugins || 0) > 0 ? ` (${crawl.setup_failed_plugins} failed)` : ''}`; + const sortedSetup = [...crawl.setup_plugins].sort((a, b) => + (a.plugin || '').localeCompare(b.plugin || '') + ); + setupHtml = ` +
+
+ +
+
+ ${sortedSetup.map(e => renderExtractor(e)).join('')} +
+
+ `; + } // Show warning if crawl is stuck (queued but can't start) let warningHtml = ''; @@ -847,6 +885,7 @@ ${warningHtml}
+ ${setupHtml} ${snapshotsHtml}
diff --git a/archivebox/templates/core/add.html b/archivebox/templates/core/add.html index dc5455c4..6663770a 100644 --- a/archivebox/templates/core/add.html +++ b/archivebox/templates/core/add.html @@ -38,56 +38,76 @@
{% csrf_token %} -

Create a new Crawl

+
+

Create a new Crawl

+

A Crawl is a job that processes URLs and creates Snapshots (archived copies) for each URL discovered. The settings below apply to the entire crawl and all snapshots it creates.

-
-

Crawl Settings

-
- {{ form.url.label_tag }} - {{ form.url }} -
0 URLs detected
+
+
+
+ {{ form.url.label_tag }} +
0 URLs detected
+
+
+ + {{ form.url }} +
+
+ +
{% if form.url.errors %}
{{ form.url.errors }}
{% endif %}
- Enter URLs to archive, one per line. Examples:
+ Enter URLs to archive, as one per line, CSV, JSON, or embedded in text (e.g. markdown, HTML, etc.). Examples:
https://example.com
- https://news.ycombinator.com
- https://github.com/ArchiveBox/ArchiveBox + https://news.ycombinator.com,https://news.google.com
+ [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox)
{{ form.tag.label_tag }} {{ form.tag }} - - - {% for tag_name in available_tags %} - {% if form.tag.errors %}
{{ form.tag.errors }}
{% endif %} -
Tags will be applied to all snapshots created by this crawl. Start typing to see existing tags.
+
Tags will be applied to all snapshots created by this crawl.
-
- {{ form.depth.label_tag }} - {{ form.depth }} - {% if form.depth.errors %} -
{{ form.depth.errors }}
- {% endif %} -
Controls how many links deep the crawl will follow from the starting URLs.
+
+
+ {{ form.depth.label_tag }} + {{ form.depth }} + {% if form.depth.errors %} +
{{ form.depth.errors }}
+ {% endif %} +
Controls how many links deep the crawl will follow from the starting URLs.
+
+ +
+ {{ form.url_filters }} + {% if form.url_filters.errors %} +
{{ form.url_filters.errors }}
+ {% endif %} +
@@ -98,6 +118,18 @@ {% endif %}
Optional description for this crawl (visible in the admin interface).
+ +
+ {{ form.persona.label_tag }} + {{ form.persona }} + {% if form.persona.errors %} +
{{ form.persona.errors }}
+ {% endif %} +
+ Authentication profile (Chrome profile, cookies, etc.) to use when accessing URLs. + Create new persona / import from Chrome → +
+
@@ -108,7 +140,6 @@ View plugin details →

-
Quick Select: @@ -118,66 +149,63 @@
- -
-
- - +
+
+
+ + +
+
+ {{ form.chrome_plugins }} +
-
- {{ form.chrome_plugins }} -
-
- -
-
- +
+
+ +
+
+ {{ form.archiving_plugins }} +
-
- {{ form.archiving_plugins }} -
-
- -
-
- +
+
+ +
+
+ {{ form.parsing_plugins }} +
-
- {{ form.parsing_plugins }} -
-
- -
-
- +
+
+ + (defaults to SEARCH_BACKEND_ENGINE) +
+
+ {{ form.search_plugins }} +
-
- {{ form.search_plugins }} -
-
- -
-
- +
+
+ +
+
+ {{ form.binary_plugins }} +
-
- {{ form.binary_plugins }} -
-
- -
-
- -
-
- {{ form.extension_plugins }} +
+
+ +
+
+ {{ form.extension_plugins }} +
@@ -203,43 +231,13 @@
-
- {{ form.persona.label_tag }} - {{ form.persona }} - {% if form.persona.errors %} -
{{ form.persona.errors }}
- {% endif %} -
- Authentication profile to use for all snapshots in this crawl. - Create new persona → -
-
- -
- {{ form.overwrite }} - {{ form.overwrite.label_tag }} - {% if form.overwrite.errors %} -
{{ form.overwrite.errors }}
- {% endif %} -
Re-archive URLs even if they already exist
-
- -
- {{ form.update }} - {{ form.update.label_tag }} - {% if form.update.errors %} -
{{ form.update.errors }}
- {% endif %} -
Retry archiving URLs that previously failed
-
-
{{ form.index_only }} {{ form.index_only.label_tag }} {% if form.index_only.errors %}
{{ form.index_only.errors }}
{% endif %} -
Create snapshots but don't run archiving plugins yet (queue for later)
+
Create the crawl and queue snapshots without running archive plugins yet.
@@ -249,7 +247,7 @@
{{ form.config.errors }}
{% endif %}
- Override any config option for this crawl (e.g., TIMEOUT, USER_AGENT, CHROME_BINARY, etc.) + Override any config option for this crawl (e.g., TIMEOUT, USER_AGENT, CHROME_BINARY, etc.). URL_ALLOWLIST, URL_DENYLIST, and ENABLED_PLUGINS are updated automatically from the fields above.
@@ -270,22 +268,738 @@ --> {% endif %} {% block extra_head %} diff --git a/archivebox/templates/core/navigation.html b/archivebox/templates/core/navigation.html index e909c362..479d6319 100644 --- a/archivebox/templates/core/navigation.html +++ b/archivebox/templates/core/navigation.html @@ -6,7 +6,7 @@ Tags | Log     Docs | - API | + API | Public | Admin     diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index 0ad5a226..8b502d48 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -456,6 +456,9 @@ text-overflow: ellipsis; white-space: nowrap; } + .thumb-card:has([data-compact]) .card-text { + display: none; + } .thumb-card:has([data-compact]) .thumbnail-text-header, .thumb-card:has([data-compact]) .thumbnail-compact-icon, .thumb-card:has([data-compact]) .thumbnail-compact-label { @@ -620,8 +623,9 @@ @@ -713,12 +715,12 @@ ⬇️ {% endif %}
+ +

{% plugin_icon result_info.name %} {{ result_info.name|plugin_name|truncatechars:20 }}

+

{{ result_info.path }}

- -

{{ result_info.name|title }}

-
{% if result_info.result %} {% with plugin_base=result_info.name|plugin_name %} {% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %} diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 6e9756b0..1ba83529 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -902,9 +902,9 @@
@@ -996,8 +996,7 @@